The patch titled
     Subject: selftests/vm: rename selftets/vm to selftests/mm
has been added to the -mm mm-unstable branch.  Its filename is
     selftests-vm-rename-selftets-vm-to-selftests-mm.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/selftests-vm-rename-selftets-vm-to-selftests-mm.patch

This patch will later appear in the mm-unstable branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included into linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days

------------------------------------------------------
From: SeongJae Park <sj@xxxxxxxxxx>
Subject: selftests/vm: rename selftets/vm to selftests/mm
Date: Tue, 3 Jan 2023 18:07:53 +0000

Rename selftests/vm to selftests/mm, to be consistent with the code,
documentation, and tools directories, and to avoid confusion with virtual
machines.

Link: https://lkml.kernel.org/r/20230103180754.129637-5-sj@xxxxxxxxxx
Signed-off-by: SeongJae Park <sj@xxxxxxxxxx>
Cc: Jonathan Corbet <corbet@xxxxxxx>
Cc: Shuah Khan <shuah@xxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 Documentation/admin-guide/mm/hugetlbpage.rst | 6
 Documentation/core-api/pin_user_pages.rst | 2
 MAINTAINERS | 4
 mm/Kconfig | 2
 tools/testing/selftests/mm/.gitignore | 38
 tools/testing/selftests/mm/Makefile | 180
 tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 584 ++
 tools/testing/selftests/mm/check_config.sh | 31
 tools/testing/selftests/mm/compaction_test.c | 231 +
 tools/testing/selftests/mm/config | 8
 tools/testing/selftests/mm/cow.c | 1764 ++++
 tools/testing/selftests/mm/gup_test.c | 271 +
 tools/testing/selftests/mm/hmm-tests.c | 2054 ++++
 tools/testing/selftests/mm/hugepage-mmap.c | 91
 tools/testing/selftests/mm/hugepage-mremap.c | 188
 tools/testing/selftests/mm/hugepage-shm.c | 101
 tools/testing/selftests/mm/hugepage-vmemmap.c | 144
 tools/testing/selftests/mm/hugetlb-madvise.c | 406 +
 tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 252 +
 tools/testing/selftests/mm/khugepaged.c | 1558 +++++++
 tools/testing/selftests/mm/ksm_functional_tests.c | 374 +
 tools/testing/selftests/mm/ksm_tests.c | 849 ++++
 tools/testing/selftests/mm/madv_populate.c | 296 +
 tools/testing/selftests/mm/map_fixed_noreplace.c | 231 +
 tools/testing/selftests/mm/map_hugetlb.c | 109
 tools/testing/selftests/mm/map_populate.c | 113
 tools/testing/selftests/mm/memfd_secret.c | 296 +
 tools/testing/selftests/mm/migration.c | 193
 tools/testing/selftests/mm/mlock-random-test.c | 294 +
 tools/testing/selftests/mm/mlock2-tests.c | 520 ++
 tools/testing/selftests/mm/mlock2.h | 63
 tools/testing/selftests/mm/mrelease_test.c | 206 +
 tools/testing/selftests/mm/mremap_dontunmap.c | 364 +
 tools/testing/selftests/mm/mremap_test.c | 550 ++
 tools/testing/selftests/mm/on-fault-limit.c | 48
 tools/testing/selftests/mm/pkey-helpers.h | 226 +
 tools/testing/selftests/mm/pkey-powerpc.h | 133
 tools/testing/selftests/mm/pkey-x86.h | 177
 tools/testing/selftests/mm/protection_keys.c | 1788 ++++
 tools/testing/selftests/mm/run_vmtests.sh | 274 +
 tools/testing/selftests/mm/settings | 1
 tools/testing/selftests/mm/soft-dirty.c | 210 +
 tools/testing/selftests/mm/split_huge_page_test.c | 309 +
 tools/testing/selftests/mm/test_hmm.sh | 105
 tools/testing/selftests/mm/test_vmalloc.sh | 177
 tools/testing/selftests/mm/thuge-gen.c | 257 +
 tools/testing/selftests/mm/transhuge-stress.c | 122
 tools/testing/selftests/mm/userfaultfd.c | 1858 +++++++++
 tools/testing/selftests/mm/util.h | 69
 tools/testing/selftests/mm/va_128TBswitch.c | 289 +
 tools/testing/selftests/mm/va_128TBswitch.sh | 54
 tools/testing/selftests/mm/virtual_address_range.c | 139
 tools/testing/selftests/mm/vm_util.c | 151
 tools/testing/selftests/mm/vm_util.h | 15
 tools/testing/selftests/mm/write_hugetlb_memory.sh | 23
 tools/testing/selftests/mm/write_to_hugetlbfs.c | 240 +
 tools/testing/selftests/vm/.gitignore | 38
 tools/testing/selftests/vm/Makefile | 180
 tools/testing/selftests/vm/charge_reserved_hugetlb.sh | 584 --
 tools/testing/selftests/vm/check_config.sh | 31
 tools/testing/selftests/vm/compaction_test.c | 231 -
 tools/testing/selftests/vm/config | 8
 tools/testing/selftests/vm/cow.c | 1764 --------
 tools/testing/selftests/vm/gup_test.c | 271 -
 tools/testing/selftests/vm/hmm-tests.c | 2054 ----------
 tools/testing/selftests/vm/hugepage-mmap.c | 91
 tools/testing/selftests/vm/hugepage-mremap.c | 188
 tools/testing/selftests/vm/hugepage-shm.c | 101
 tools/testing/selftests/vm/hugepage-vmemmap.c | 144
 tools/testing/selftests/vm/hugetlb-madvise.c | 406 -
 tools/testing/selftests/vm/hugetlb_reparenting_test.sh | 252 -
 tools/testing/selftests/vm/khugepaged.c | 1558 -------
 tools/testing/selftests/vm/ksm_functional_tests.c | 374 -
 tools/testing/selftests/vm/ksm_tests.c | 849 ----
 tools/testing/selftests/vm/madv_populate.c | 296 -
 tools/testing/selftests/vm/map_fixed_noreplace.c | 231 -
 tools/testing/selftests/vm/map_hugetlb.c | 109
 tools/testing/selftests/vm/map_populate.c | 113
 tools/testing/selftests/vm/memfd_secret.c | 296 -
 tools/testing/selftests/vm/migration.c | 193
 tools/testing/selftests/vm/mlock-random-test.c | 294 -
 tools/testing/selftests/vm/mlock2-tests.c | 520 --
 tools/testing/selftests/vm/mlock2.h | 63
 tools/testing/selftests/vm/mrelease_test.c | 206 -
 tools/testing/selftests/vm/mremap_dontunmap.c | 364 -
 tools/testing/selftests/vm/mremap_test.c | 550 --
 tools/testing/selftests/vm/on-fault-limit.c | 48
 tools/testing/selftests/vm/pkey-helpers.h | 226 -
 tools/testing/selftests/vm/pkey-powerpc.h | 133
 tools/testing/selftests/vm/pkey-x86.h | 177
 tools/testing/selftests/vm/protection_keys.c | 1788 --------
 tools/testing/selftests/vm/run_vmtests.sh | 274 -
 tools/testing/selftests/vm/settings | 1
 tools/testing/selftests/vm/soft-dirty.c | 210 -
 tools/testing/selftests/vm/split_huge_page_test.c | 309 -
 tools/testing/selftests/vm/test_hmm.sh | 105
 tools/testing/selftests/vm/test_vmalloc.sh | 177
 tools/testing/selftests/vm/thuge-gen.c | 257 -
 tools/testing/selftests/vm/transhuge-stress.c | 122
 tools/testing/selftests/vm/userfaultfd.c | 1858 ---------
 tools/testing/selftests/vm/util.h | 69
 tools/testing/selftests/vm/va_128TBswitch.c | 289 -
 tools/testing/selftests/vm/va_128TBswitch.sh | 54
 tools/testing/selftests/vm/virtual_address_range.c | 139
 tools/testing/selftests/vm/vm_util.c | 151
 tools/testing/selftests/vm/vm_util.h | 15
 tools/testing/selftests/vm/write_hugetlb_memory.sh | 23
 tools/testing/selftests/vm/write_to_hugetlbfs.c | 240 -
 108 files changed, 19031 insertions(+), 19031 deletions(-)

--- a/Documentation/admin-guide/mm/hugetlbpage.rst~selftests-vm-rename-selftets-vm-to-selftests-mm
+++
a/Documentation/admin-guide/mm/hugetlbpage.rst @@ -461,13 +461,13 @@ Examples .. _map_hugetlb: ``map_hugetlb`` - see tools/testing/selftests/vm/map_hugetlb.c + see tools/testing/selftests/mm/map_hugetlb.c ``hugepage-shm`` - see tools/testing/selftests/vm/hugepage-shm.c + see tools/testing/selftests/mm/hugepage-shm.c ``hugepage-mmap`` - see tools/testing/selftests/vm/hugepage-mmap.c + see tools/testing/selftests/mm/hugepage-mmap.c The `libhugetlbfs`_ library provides a wide range of userspace tools to help with huge page usability, environment setup, and control. --- a/Documentation/core-api/pin_user_pages.rst~selftests-vm-rename-selftets-vm-to-selftests-mm +++ a/Documentation/core-api/pin_user_pages.rst @@ -221,7 +221,7 @@ Unit testing ============ This file:: - tools/testing/selftests/vm/gup_test.c + tools/testing/selftests/mm/gup_test.c has the following new calls to exercise the new pin*() wrapper functions: --- a/MAINTAINERS~selftests-vm-rename-selftets-vm-to-selftests-mm +++ a/MAINTAINERS @@ -9466,7 +9466,7 @@ F: Documentation/mm/hmm.rst F: include/linux/hmm* F: lib/test_hmm* F: mm/hmm* -F: tools/testing/selftests/vm/*hmm* +F: tools/testing/selftests/mm/*hmm* HOST AP DRIVER M: Jouni Malinen <j@xxxxx> @@ -13484,7 +13484,7 @@ F: include/linux/mmzone.h F: include/linux/pagewalk.h F: mm/ F: tools/mm/ -F: tools/testing/selftests/vm/ +F: tools/testing/selftests/mm/ VMALLOC M: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- a/mm/Kconfig~selftests-vm-rename-selftets-vm-to-selftests-mm +++ a/mm/Kconfig @@ -1073,7 +1073,7 @@ config GUP_TEST pin_user_pages*(), or pinned via get_user_pages*(), as specified by other command line arguments. - See tools/testing/selftests/vm/gup_test.c + See tools/testing/selftests/mm/gup_test.c comment "GUP_TEST needs to have DEBUG_FS enabled" depends on !GUP_TEST && !DEBUG_FS --- /dev/null +++ a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh @@ -0,0 +1,584 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +set -e + +if [[ $(id -u) -ne 0 ]]; then + echo "This test must be run as root. Skipping..." 
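+ # Exit status 4 (ksft_skip) tells the kselftest harness to report
+ # this test as skipped rather than failed.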
+ exit $ksft_skip +fi + +fault_limit_file=limit_in_bytes +reservation_limit_file=rsvd.limit_in_bytes +fault_usage_file=usage_in_bytes +reservation_usage_file=rsvd.usage_in_bytes + +if [[ "$1" == "-cgroup-v2" ]]; then + cgroup2=1 + fault_limit_file=max + reservation_limit_file=rsvd.max + fault_usage_file=current + reservation_usage_file=rsvd.current +fi + +if [[ $cgroup2 ]]; then + cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory + mount -t cgroup2 none $cgroup_path + do_umount=1 + fi + echo "+hugetlb" >$cgroup_path/cgroup.subtree_control +else + cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory + mount -t cgroup memory,hugetlb $cgroup_path + do_umount=1 + fi +fi +export cgroup_path + +function cleanup() { + if [[ $cgroup2 ]]; then + echo $$ >$cgroup_path/cgroup.procs + else + echo $$ >$cgroup_path/tasks + fi + + if [[ -e /mnt/huge ]]; then + rm -rf /mnt/huge/* + umount /mnt/huge || echo error + rmdir /mnt/huge + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test1 + fi + if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then + rmdir $cgroup_path/hugetlb_cgroup_test2 + fi + echo 0 >/proc/sys/vm/nr_hugepages + echo CLEANUP DONE +} + +function expect_equal() { + local expected="$1" + local actual="$2" + local error="$3" + + if [[ "$expected" != "$actual" ]]; then + echo "expected ($expected) != actual ($actual): $3" + cleanup + exit 1 + fi +} + +function get_machine_hugepage_size() { + hpz=$(grep -i hugepagesize /proc/meminfo) + kb=${hpz:14:-3} + mb=$(($kb / 1024)) + echo $mb +} + +MB=$(get_machine_hugepage_size) + +function setup_cgroup() { + local name="$1" + local cgroup_limit="$2" + local reservation_limit="$3" + + mkdir $cgroup_path/$name + + echo writing cgroup limit: "$cgroup_limit" + echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file + + echo writing reseravation limit: "$reservation_limit" + echo "$reservation_limit" > \ + $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file + + if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then + echo 0 >$cgroup_path/$name/cpuset.cpus + fi + if [ -e "$cgroup_path/$name/cpuset.mems" ]; then + echo 0 >$cgroup_path/$name/cpuset.mems + fi +} + +function wait_for_hugetlb_memory_to_get_depleted() { + local cgroup="$1" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + # Wait for hugetlbfs memory to get depleted. + while [ $(cat $path) != 0 ]; do + echo Waiting for hugetlb memory to get depleted. + cat $path + sleep 0.5 + done +} + +function wait_for_hugetlb_memory_to_get_reserved() { + local cgroup="$1" + local size="$2" + + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + # Wait for hugetlbfs memory to get written. + while [ $(cat $path) != $size ]; do + echo Waiting for hugetlb memory reservation to reach size $size. + cat $path + sleep 0.5 + done +} + +function wait_for_hugetlb_memory_to_get_written() { + local cgroup="$1" + local size="$2" + + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" + # Wait for hugetlbfs memory to get written. + while [ $(cat $path) != $size ]; do + echo Waiting for hugetlb memory to reach size $size. 
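+ # The writer runs asynchronously, so poll the cgroup's fault usage
+ # counter until it reports the expected number of bytes.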
+ cat $path + sleep 0.5 + done +} + +function write_hugetlbfs_and_get_usage() { + local cgroup="$1" + local size="$2" + local populate="$3" + local write="$4" + local path="$5" + local method="$6" + local private="$7" + local expect_failure="$8" + local reserve="$9" + + # Function return values. + reservation_failed=0 + oom_killed=0 + hugetlb_difference=0 + reserved_difference=0 + + local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file + local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file + + local hugetlb_before=$(cat $hugetlb_usage) + local reserved_before=$(cat $reserved_usage) + + echo + echo Starting: + echo hugetlb_usage="$hugetlb_before" + echo reserved_usage="$reserved_before" + echo expect_failure is "$expect_failure" + + output=$(mktemp) + set +e + if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] || + [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then + + bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ + "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output & + + local write_result=$? + local write_pid=$! + + until grep -q -i "DONE" $output; do + echo waiting for DONE signal. + if ! ps $write_pid > /dev/null + then + echo "FAIL: The write died" + cleanup + exit 1 + fi + sleep 0.5 + done + + echo ================= write_hugetlb_memory.sh output is: + cat $output + echo ================= end output. + + if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then + wait_for_hugetlb_memory_to_get_written "$cgroup" "$size" + elif [[ "$reserve" != "-n" ]]; then + wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" + else + # This case doesn't produce visible effects, but we still have + # to wait for the async process to start and execute... + sleep 0.5 + fi + + echo write_result is $write_result + else + bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ + "$cgroup" "$path" "$method" "$private" "$reserve" + local write_result=$? + + if [[ "$reserve" != "-n" ]]; then + wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" + fi + fi + set -e + + if [[ "$write_result" == 1 ]]; then + reservation_failed=1 + fi + + # On linus/master, the above process gets SIGBUS'd on oomkill, with + # return code 135. On earlier kernels, it gets actual oomkill, with return + # code 137, so just check for both conditions in case we're testing + # against an earlier kernel. + if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then + oom_killed=1 + fi + + local hugetlb_after=$(cat $hugetlb_usage) + local reserved_after=$(cat $reserved_usage) + + echo After write: + echo hugetlb_usage="$hugetlb_after" + echo reserved_usage="$reserved_after" + + hugetlb_difference=$(($hugetlb_after - $hugetlb_before)) + reserved_difference=$(($reserved_after - $reserved_before)) +} + +function cleanup_hugetlb_memory() { + set +e + local cgroup="$1" + if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then + echo killing write_to_hugetlbfs + killall -2 write_to_hugetlbfs + wait_for_hugetlb_memory_to_get_depleted $cgroup + fi + set -e + + if [[ -e /mnt/huge ]]; then + rm -rf /mnt/huge/* + umount /mnt/huge + rmdir /mnt/huge + fi +} + +function run_test() { + local size=$(($1 * ${MB} * 1024 * 1024)) + local populate="$2" + local write="$3" + local cgroup_limit=$(($4 * ${MB} * 1024 * 1024)) + local reservation_limit=$(($5 * ${MB} * 1024 * 1024)) + local nr_hugepages="$6" + local method="$7" + local private="$8" + local expect_failure="$9" + local reserve="${10}" + + # Function return values. 
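+ # (Shell functions can only return an exit status, so results are
+ # passed back to the caller through these globals.)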
+ hugetlb_difference=0 + reserved_difference=0 + reservation_failed=0 + oom_killed=0 + + echo nr hugepages = "$nr_hugepages" + echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages + + setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" + + mkdir -p /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ + "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ + "$reserve" + + cleanup_hugetlb_memory "hugetlb_cgroup_test" + + local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file) + local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file) + + echo $hugetlb_difference + echo $reserved_difference + expect_equal "0" "$final_hugetlb" "final hugetlb is not zero" + expect_equal "0" "$final_reservation" "final reservation is not zero" +} + +function run_multiple_cgroup_test() { + local size1="$1" + local populate1="$2" + local write1="$3" + local cgroup_limit1="$4" + local reservation_limit1="$5" + + local size2="$6" + local populate2="$7" + local write2="$8" + local cgroup_limit2="$9" + local reservation_limit2="${10}" + + local nr_hugepages="${11}" + local method="${12}" + local private="${13}" + local expect_failure="${14}" + local reserve="${15}" + + # Function return values. + hugetlb_difference1=0 + reserved_difference1=0 + reservation_failed1=0 + oom_killed1=0 + + hugetlb_difference2=0 + reserved_difference2=0 + reservation_failed2=0 + oom_killed2=0 + + echo nr hugepages = "$nr_hugepages" + echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages + + setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1" + setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" + + mkdir -p /mnt/huge + mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ + "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ + "$expect_failure" "$reserve" + + hugetlb_difference1=$hugetlb_difference + reserved_difference1=$reserved_difference + reservation_failed1=$reservation_failed + oom_killed1=$oom_killed + + local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file + local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file + local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file + local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file + + local usage_before_second_write=$(cat $cgroup1_hugetlb_usage) + local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage) + + write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \ + "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \ + "$expect_failure" "$reserve" + + hugetlb_difference2=$hugetlb_difference + reserved_difference2=$reserved_difference + reservation_failed2=$reservation_failed + oom_killed2=$oom_killed + + expect_equal "$usage_before_second_write" \ + "$(cat $cgroup1_hugetlb_usage)" "Usage changed." + expect_equal "$reservation_usage_before_second_write" \ + "$(cat $cgroup1_reservation_usage)" "Reservation usage changed." 
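+ # The second cgroup's writes must not change what is already charged
+ # to the first cgroup; the two checks above verify that isolation.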
+ + cleanup_hugetlb_memory + + local final_hugetlb=$(cat $cgroup1_hugetlb_usage) + local final_reservation=$(cat $cgroup1_reservation_usage) + + expect_equal "0" "$final_hugetlb" \ + "hugetlbt_cgroup_test1 final hugetlb is not zero" + expect_equal "0" "$final_reservation" \ + "hugetlbt_cgroup_test1 final reservation is not zero" + + local final_hugetlb=$(cat $cgroup2_hugetlb_usage) + local final_reservation=$(cat $cgroup2_reservation_usage) + + expect_equal "0" "$final_hugetlb" \ + "hugetlb_cgroup_test2 final hugetlb is not zero" + expect_equal "0" "$final_reservation" \ + "hugetlb_cgroup_test2 final reservation is not zero" +} + +cleanup + +for populate in "" "-o"; do + for method in 0 1 2; do + for private in "" "-r"; do + for reserve in "" "-n"; do + + # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported. + if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then + continue + fi + + # Skip populated shmem tests. Doesn't seem to be supported. + if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then + continue + fi + + if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then + continue + fi + + cleanup + echo + echo + echo + echo Test normal case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb=$hugetlb_difference + echo Memory charged to reservation=$reserved_difference + + if [[ "$populate" == "-o" ]]; then + expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + else + expect_equal "0" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + fi + + if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then + expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + else + expect_equal "0" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + fi + + echo 'PASS' + + cleanup + echo + echo + echo + echo Test normal case with write. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb=$hugetlb_difference + echo Memory charged to reservation=$reserved_difference + + expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ + "Reserved memory charged to hugetlb cgroup." + + expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ + "Reserved memory not charged to reservation usage." + + echo 'PASS' + + cleanup + continue + echo + echo + echo + echo Test more than reservation case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + + if [ "$reserve" != "-n" ]; then + run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \ + "$reserve" + + expect_equal "1" "$reservation_failed" "Reservation succeeded." + fi + + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test more than cgroup limit case. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + + # Not sure if shm memory can be cleaned up when the process gets sigbus'd. + if [[ "$method" != 2 ]]; then + run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve" + + expect_equal "1" "$oom_killed" "Not oom killed." + fi + echo 'PASS' + + cleanup + + echo + echo + echo + echo Test normal case, multiple cgroups. 
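+ # Charge from two cgroups against the same hugetlbfs mount, and check
+ # that each cgroup is charged only for its own allocations.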
+ echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \ + "$populate" "" "10" "10" "10" \ + "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb1=$hugetlb_difference1 + echo Memory charged to reservation1=$reserved_difference1 + echo Memory charged to hugtlb2=$hugetlb_difference2 + echo Memory charged to reservation2=$reserved_difference2 + + if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then + expect_equal "3" "$reserved_difference1" \ + "Incorrect reservations charged to cgroup 1." + + expect_equal "5" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + + else + expect_equal "0" "$reserved_difference1" \ + "Incorrect reservations charged to cgroup 1." + + expect_equal "0" "$reserved_difference2" \ + "Incorrect reservation charged to cgroup 2." + fi + + if [[ "$populate" == "-o" ]]; then + expect_equal "3" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "5" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + + else + expect_equal "0" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "0" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + fi + echo 'PASS' + + cleanup + echo + echo + echo + echo Test normal case with write, multiple cgroups. + echo private=$private, populate=$populate, method=$method, reserve=$reserve + run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \ + "$populate" "-w" "10" "10" "10" \ + "$method" "$private" "0" "$reserve" + + echo Memory charged to hugtlb1=$hugetlb_difference1 + echo Memory charged to reservation1=$reserved_difference1 + echo Memory charged to hugtlb2=$hugetlb_difference2 + echo Memory charged to reservation2=$reserved_difference2 + + expect_equal "3" "$hugetlb_difference1" \ + "Incorrect hugetlb charged to cgroup 1." + + expect_equal "3" "$reserved_difference1" \ + "Incorrect reservation charged to cgroup 1." + + expect_equal "5" "$hugetlb_difference2" \ + "Incorrect hugetlb charged to cgroup 2." + + expect_equal "5" "$reserved_difference2" \ + "Incorrected reservation charged to cgroup 2." + echo 'PASS' + + cleanup + + done # reserve + done # private + done # populate +done # method + +if [[ $do_umount ]]; then + umount $cgroup_path + rmdir $cgroup_path +fi --- /dev/null +++ a/tools/testing/selftests/mm/check_config.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 +# +# Probe for libraries and create header files to record the results. Both C +# header files and Makefile include fragments are created. 
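+# The probe compiles a one-line program against the library; if the
+# compile succeeds, a LOCAL_CONFIG_HAVE_* define and the extra linker
+# flags are emitted for the Makefile to pick up.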
+ +OUTPUT_H_FILE=local_config.h +OUTPUT_MKFILE=local_config.mk + +tmpname=$(mktemp) +tmpfile_c=${tmpname}.c +tmpfile_o=${tmpname}.o + +# liburing +echo "#include <sys/types.h>" > $tmpfile_c +echo "#include <liburing.h>" >> $tmpfile_c +echo "int func(void) { return 0; }" >> $tmpfile_c + +CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"} +$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 + +if [ -f $tmpfile_o ]; then + echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE + echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE +else + echo "// No liburing support found" > $OUTPUT_H_FILE + echo "# No liburing support found, so:" > $OUTPUT_MKFILE + echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE +fi + +rm ${tmpname}.* --- /dev/null +++ a/tools/testing/selftests/mm/compaction_test.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * + * A test for the patch "Allow compaction of unevictable pages". + * With this patch we should be able to allocate at least 1/4 + * of RAM in huge pages. Without the patch much less is + * allocated. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <fcntl.h> +#include <errno.h> +#include <unistd.h> +#include <string.h> + +#include "../kselftest.h" + +#define MAP_SIZE_MB 100 +#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) + +struct map_list { + void *map; + struct map_list *next; +}; + +int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize) +{ + char buffer[256] = {0}; + char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'"; + FILE *cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + perror("Failed to read meminfo\n"); + return -1; + } + + pclose(cmdfile); + + *memfree = atoll(buffer); + cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'"; + cmdfile = popen(cmd, "r"); + + if (!(fgets(buffer, sizeof(buffer), cmdfile))) { + perror("Failed to read meminfo\n"); + return -1; + } + + pclose(cmdfile); + *hugepagesize = atoll(buffer); + + return 0; +} + +int prereq(void) +{ + char allowed; + int fd; + + fd = open("/proc/sys/vm/compact_unevictable_allowed", + O_RDONLY | O_NONBLOCK); + if (fd < 0) { + perror("Failed to open\n" + "/proc/sys/vm/compact_unevictable_allowed\n"); + return -1; + } + + if (read(fd, &allowed, sizeof(char)) != sizeof(char)) { + perror("Failed to read from\n" + "/proc/sys/vm/compact_unevictable_allowed\n"); + close(fd); + return -1; + } + + close(fd); + if (allowed == '1') + return 0; + + return -1; +} + +int check_compaction(unsigned long mem_free, unsigned int hugepage_size) +{ + int fd; + int compaction_index = 0; + char initial_nr_hugepages[10] = {0}; + char nr_hugepages[10] = {0}; + + /* We want to test with 80% of available memory. Else, OOM killer comes + in to play */ + mem_free = mem_free * 0.8; + + fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); + if (fd < 0) { + perror("Failed to open /proc/sys/vm/nr_hugepages"); + return -1; + } + + if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { + perror("Failed to read from /proc/sys/vm/nr_hugepages"); + goto close_fd; + } + + /* Start with the initial condition of 0 huge pages*/ + if (write(fd, "0", sizeof(char)) != sizeof(char)) { + perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + /* Request a large number of huge pages. 
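(100000 is deliberately more huge pages than the test expects to get.)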
The Kernel will allocate + as much as it can */ + if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { + perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + lseek(fd, 0, SEEK_SET); + + if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { + perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + /* We should have been able to request at least 1/3 rd of the memory in + huge pages */ + compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size); + + if (compaction_index > 3) { + printf("No of huge pages allocated = %d\n", + (atoi(nr_hugepages))); + fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n" + "as huge pages\n", compaction_index); + goto close_fd; + } + + printf("No of huge pages allocated = %d\n", + (atoi(nr_hugepages))); + + lseek(fd, 0, SEEK_SET); + + if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) + != strlen(initial_nr_hugepages)) { + perror("Failed to write value to /proc/sys/vm/nr_hugepages\n"); + goto close_fd; + } + + close(fd); + return 0; + + close_fd: + close(fd); + printf("Not OK. Compaction test failed."); + return -1; +} + + +int main(int argc, char **argv) +{ + struct rlimit lim; + struct map_list *list, *entry; + size_t page_size, i; + void *map = NULL; + unsigned long mem_free = 0; + unsigned long hugepage_size = 0; + long mem_fragmentable_MB = 0; + + if (prereq() != 0) { + printf("Either the sysctl compact_unevictable_allowed is not\n" + "set to 1 or couldn't read the proc file.\n" + "Skipping the test\n"); + return KSFT_SKIP; + } + + lim.rlim_cur = RLIM_INFINITY; + lim.rlim_max = RLIM_INFINITY; + if (setrlimit(RLIMIT_MEMLOCK, &lim)) { + perror("Failed to set rlimit:\n"); + return -1; + } + + page_size = getpagesize(); + + list = NULL; + + if (read_memory_info(&mem_free, &hugepage_size) != 0) { + printf("ERROR: Cannot read meminfo\n"); + return -1; + } + + mem_fragmentable_MB = mem_free * 0.8 / 1024; + + while (mem_fragmentable_MB > 0) { + map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); + if (map == MAP_FAILED) + break; + + entry = malloc(sizeof(struct map_list)); + if (!entry) { + munmap(map, MAP_SIZE); + break; + } + entry->map = map; + entry->next = list; + list = entry; + + /* Write something (in this case the address of the map) to + * ensure that KSM can't merge the mapped pages + */ + for (i = 0; i < MAP_SIZE; i += page_size) + *(unsigned long *)(map + i) = (unsigned long)map + i; + + mem_fragmentable_MB -= MAP_SIZE_MB; + } + + for (entry = list; entry != NULL; entry = entry->next) { + munmap(entry->map, MAP_SIZE); + if (!entry->next) + break; + entry = entry->next; + } + + if (check_compaction(mem_free, hugepage_size) == 0) + return 0; + + return -1; +} --- /dev/null +++ a/tools/testing/selftests/mm/config @@ -0,0 +1,8 @@ +CONFIG_SYSVIPC=y +CONFIG_USERFAULTFD=y +CONFIG_TEST_VMALLOC=m +CONFIG_DEVICE_PRIVATE=y +CONFIG_TEST_HMM=m +CONFIG_GUP_TEST=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_MEM_SOFT_DIRTY=y --- /dev/null +++ a/tools/testing/selftests/mm/cow.c @@ -0,0 +1,1764 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * COW (Copy On Write) tests. + * + * Copyright 2022, Red Hat, Inc. 
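+ * (These tests verify that anonymous pages shared via fork() or pinned
+ * via GUP are unshared on write, so neither side can observe the
+ * other's modifications.)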
+ * + * Author(s): David Hildenbrand <david@xxxxxxxxxx> + */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <dirent.h> +#include <assert.h> +#include <sys/mman.h> +#include <sys/ioctl.h> +#include <sys/wait.h> +#include <linux/memfd.h> + +#include "local_config.h" +#ifdef LOCAL_CONFIG_HAVE_LIBURING +#include <liburing.h> +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +#include "../../../../mm/gup_test.h" +#include "../kselftest.h" +#include "vm_util.h" + +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + +static size_t pagesize; +static int pagemap_fd; +static size_t thpsize; +static int nr_hugetlbsizes; +static size_t hugetlbsizes[10]; +static int gup_fd; +static bool has_huge_zeropage; + +static void detect_thpsize(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", + O_RDONLY); + size_t size = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + size = strtoul(buf, NULL, 10); + if (size < pagesize) + size = 0; + if (size > 0) { + thpsize = size; + ksft_print_msg("[INFO] detected THP size: %zu KiB\n", + thpsize / 1024); + } + } + + close(fd); +} + +static void detect_huge_zeropage(void) +{ + int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", + O_RDONLY); + size_t enabled = 0; + char buf[15]; + int ret; + + if (fd < 0) + return; + + ret = pread(fd, buf, sizeof(buf), 0); + if (ret > 0 && ret < sizeof(buf)) { + buf[ret] = 0; + + enabled = strtoul(buf, NULL, 10); + if (enabled == 1) { + has_huge_zeropage = true; + ksft_print_msg("[INFO] huge zeropage is enabled\n"); + } + } + + close(fd); +} + +static void detect_hugetlbsizes(void) +{ + DIR *dir = opendir("/sys/kernel/mm/hugepages/"); + + if (!dir) + return; + + while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) { + struct dirent *entry = readdir(dir); + size_t kb; + + if (!entry) + break; + if (entry->d_type != DT_DIR) + continue; + if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) + continue; + hugetlbsizes[nr_hugetlbsizes] = kb * 1024; + nr_hugetlbsizes++; + ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n", + kb); + } + closedir(dir); +} + +static bool range_is_swapped(void *addr, size_t size) +{ + for (; size; addr += pagesize, size -= pagesize) + if (!pagemap_is_swapped(pagemap_fd, addr)) + return false; + return true; +} + +struct comm_pipes { + int child_ready[2]; + int parent_ready[2]; +}; + +static int setup_comm_pipes(struct comm_pipes *comm_pipes) +{ + if (pipe(comm_pipes->child_ready) < 0) + return -errno; + if (pipe(comm_pipes->parent_ready) < 0) { + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + return -errno; + } + + return 0; +} + +static void close_comm_pipes(struct comm_pipes *comm_pipes) +{ + close(comm_pipes->child_ready[0]); + close(comm_pipes->child_ready[1]); + close(comm_pipes->parent_ready[0]); + close(comm_pipes->parent_ready[1]); +} + +static int child_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + char *old = malloc(size); + char buf; + + /* Backup the original content. */ + memcpy(old, mem, size); + + /* Wait until the parent modified the page. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values. 
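(A nonzero memcmp() means the parent's post-fork write leaked into the child.)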
*/ + return memcmp(old, mem, size); +} + +static int child_vmsplice_memcmp_fn(char *mem, size_t size, + struct comm_pipes *comm_pipes) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + char *old, *new; + int fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + /* Backup the original content. */ + memcpy(old, mem, size); + + if (pipe(fds) < 0) + return -errno; + + /* Trigger a read-only pin. */ + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred < 0) + return -errno; + if (transferred == 0) + return -EINVAL; + + /* Unmap it from our page tables. */ + if (munmap(mem, size) < 0) + return -errno; + + /* Wait until the parent modified it. */ + write(comm_pipes->child_ready[1], "0", 1); + while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) + ; + + /* See if we still read the old values via the pipe. */ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) + return -errno; + } + + return memcmp(old, new, transferred); +} + +typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); + +static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, + child_fn fn) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + exit(fn(mem, size, &comm_pipes)); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (do_mprotect) { + /* + * mprotect() optimizations might try avoiding + * write-faults by directly mapping pages writable. + */ + ret = mprotect(mem, size, PROT_READ); + ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + } + + /* Modify the page. 
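(This write must trigger a COW break; the child still holds the old content for comparison.)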
*/ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_cow_in_parent(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, false, child_memcmp_fn); +} + +static void test_cow_in_parent_mprotect(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, true, child_memcmp_fn); +} + +static void test_vmsplice_in_child(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); +} + +static void test_vmsplice_in_child_mprotect(char *mem, size_t size) +{ + do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); +} + +static void do_test_vmsplice_in_parent(char *mem, size_t size, + bool before_fork) +{ + struct iovec iov = { + .iov_base = mem, + .iov_len = size, + }; + ssize_t cur, total, transferred; + struct comm_pipes comm_pipes; + char *old, *new; + int ret, fds[2]; + char buf; + + old = malloc(size); + new = malloc(size); + + memcpy(old, mem, size); + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free; + } + + if (pipe(fds) < 0) { + ksft_test_result_fail("pipe() failed\n"); + goto close_comm_pipes; + } + + if (before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + goto close_pipe; + } + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_pipe; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + /* Modify page content in the child. */ + memset(mem, 0xff, size); + exit(0); + } + + if (!before_fork) { + transferred = vmsplice(fds[1], &iov, 1, 0); + if (transferred <= 0) { + ksft_test_result_fail("vmsplice() failed\n"); + wait(&ret); + goto close_pipe; + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + if (munmap(mem, size) < 0) { + ksft_test_result_fail("munmap() failed\n"); + goto close_pipe; + } + write(comm_pipes.parent_ready[1], "0", 1); + + /* Wait until the child is done writing. */ + wait(&ret); + if (!WIFEXITED(ret)) { + ksft_test_result_fail("wait() failed\n"); + goto close_pipe; + } + + /* See if we still read the old values. 
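(Drain the vmsplice()'d pages from the pipe and compare them against the snapshot taken before fork().)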
*/ + for (total = 0; total < transferred; total += cur) { + cur = read(fds[0], new + total, transferred - total); + if (cur < 0) { + ksft_test_result_fail("read() failed\n"); + goto close_pipe; + } + } + + ksft_test_result(!memcmp(old, new, transferred), + "No leak from child into parent\n"); +close_pipe: + close(fds[0]); + close(fds[1]); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free: + free(old); + free(new); +} + +static void test_vmsplice_before_fork(char *mem, size_t size) +{ + do_test_vmsplice_in_parent(mem, size, true); +} + +static void test_vmsplice_after_fork(char *mem, size_t size) +{ + do_test_vmsplice_in_parent(mem, size, false); +} + +#ifdef LOCAL_CONFIG_HAVE_LIBURING +static void do_test_iouring(char *mem, size_t size, bool use_fork) +{ + struct comm_pipes comm_pipes; + struct io_uring_cqe *cqe; + struct io_uring_sqe *sqe; + struct io_uring ring; + ssize_t cur, total; + struct iovec iov; + char *buf, *tmp; + int ret, fd; + FILE *file; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + file = tmpfile(); + if (!file) { + ksft_test_result_fail("tmpfile() failed\n"); + goto close_comm_pipes; + } + fd = fileno(file); + assert(fd); + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + goto close_file; + } + + /* Skip on errors, as we might just lack kernel support. */ + ret = io_uring_queue_init(1, &ring, 0); + if (ret < 0) { + ksft_test_result_skip("io_uring_queue_init() failed\n"); + goto free_tmp; + } + + /* + * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN + * | FOLL_LONGTERM the range. + * + * Skip on errors, as we might just lack kernel support or might not + * have sufficient MEMLOCK permissions. + */ + iov.iov_base = mem; + iov.iov_len = size; + ret = io_uring_register_buffers(&ring, &iov, 1); + if (ret) { + ksft_test_result_skip("io_uring_register_buffers() failed\n"); + goto queue_exit; + } + + if (use_fork) { + /* + * fork() and keep the child alive until we're done. Note that + * we expect the pinned page to not get shared with the child. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto unregister_buffers; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + } else { + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). + */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto unregister_buffers; + } + } + + /* + * Modify the page and write page content as observed by the fixed + * buffer pin to the file so we can verify it. 
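+ * (If the pin were left pointing at a stale page after the write, the
+ * file written through it would differ from the buffer, and the final
+ * memcmp() would catch it.)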
+ */ + memset(mem, 0xff, size); + sqe = io_uring_get_sqe(&ring); + if (!sqe) { + ksft_test_result_fail("io_uring_get_sqe() failed\n"); + goto quit_child; + } + io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); + + ret = io_uring_submit(&ring); + if (ret < 0) { + ksft_test_result_fail("io_uring_submit() failed\n"); + goto quit_child; + } + + ret = io_uring_wait_cqe(&ring, &cqe); + if (ret < 0) { + ksft_test_result_fail("io_uring_wait_cqe() failed\n"); + goto quit_child; + } + + if (cqe->res != size) { + ksft_test_result_fail("write_fixed failed\n"); + goto quit_child; + } + io_uring_cqe_seen(&ring, cqe); + + /* Read back the file content to the temporary buffer. */ + total = 0; + while (total < size) { + cur = pread(fd, tmp + total, size - total, total); + if (cur < 0) { + ksft_test_result_fail("pread() failed\n"); + goto quit_child; + } + total += cur; + } + + /* Finally, check if we read what we expected. */ + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/W pin is reliable\n"); + +quit_child: + if (use_fork) { + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + } +unregister_buffers: + io_uring_unregister_buffers(&ring); +queue_exit: + io_uring_queue_exit(&ring); +free_tmp: + free(tmp); +close_file: + fclose(file); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_iouring_ro(char *mem, size_t size) +{ + do_test_iouring(mem, size, false); +} + +static void test_iouring_fork(char *mem, size_t size) +{ + do_test_iouring(mem, size, true); +} + +#endif /* LOCAL_CONFIG_HAVE_LIBURING */ + +enum ro_pin_test { + RO_PIN_TEST, + RO_PIN_TEST_SHARED, + RO_PIN_TEST_PREVIOUSLY_SHARED, + RO_PIN_TEST_RO_EXCLUSIVE, +}; + +static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, + bool fast) +{ + struct pin_longterm_test args; + struct comm_pipes comm_pipes; + char *tmp, buf; + __u64 tmp_val; + int ret; + + if (gup_fd < 0) { + ksft_test_result_skip("gup_test not available\n"); + return; + } + + tmp = malloc(size); + if (!tmp) { + ksft_test_result_fail("malloc() failed\n"); + return; + } + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + goto free_tmp; + } + + switch (test) { + case RO_PIN_TEST: + break; + case RO_PIN_TEST_SHARED: + case RO_PIN_TEST_PREVIOUSLY_SHARED: + /* + * Share the pages with our child. As the pages are not pinned, + * this should just work. + */ + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + write(comm_pipes.child_ready[1], "0", 1); + while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) + ; + exit(0); + } + + /* Wait until our child is ready. */ + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { + /* + * Tell the child to quit now and wait until it quit. + * The pages should now be mapped R/O into our page + * tables, but they are no longer shared. + */ + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + } + break; + case RO_PIN_TEST_RO_EXCLUSIVE: + /* + * Map the page R/O into the page table. Enable softdirty + * tracking to stop the page from getting mapped R/W immediately + * again by mprotect() optimizations. Note that we don't have an + * easy way to test if that worked (the pagemap does not export + * if the page is mapped R/O vs. R/W). 
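+ * (Clearing soft-dirty write-protects the PTEs, so the next write is
+ * guaranteed to take a fault.)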
+ */ + ret = mprotect(mem, size, PROT_READ); + clear_softdirty(); + ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Take a R/O pin. This should trigger unsharing. */ + args.addr = (__u64)(uintptr_t)mem; + args.size = size; + args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); + if (ret) { + if (errno == EINVAL) + ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); + else + ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); + goto wait; + } + + /* Modify the page. */ + memset(mem, 0xff, size); + + /* + * Read back the content via the pin to the temporary buffer and + * test if we observed the modification. + */ + tmp_val = (__u64)(uintptr_t)tmp; + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); + if (ret) + ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); + else + ksft_test_result(!memcmp(mem, tmp, size), + "Longterm R/O pin is reliable\n"); + + ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); + if (ret) + ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); +wait: + switch (test) { + case RO_PIN_TEST_SHARED: + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + if (!WIFEXITED(ret)) + ksft_print_msg("[INFO] wait() failed\n"); + break; + default: + break; + } +close_comm_pipes: + close_comm_pipes(&comm_pipes); +free_tmp: + free(tmp); +} + +static void test_ro_pin_on_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); +} + +static void test_ro_fast_pin_on_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); +} + +static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); +} + +static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); +} + +static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); +} + +static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) +{ + do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); +} + +typedef void (*test_fn)(char *mem, size_t size); + +static void do_run_with_base_page(test_fn fn, bool swapout) +{ + char *mem; + int ret; + + mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); + /* Ignore if not around on a kernel. */ + if (ret && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto munmap; + } + + /* Populate a base page. */ + memset(mem, 0, pagesize); + + if (swapout) { + madvise(mem, pagesize, MADV_PAGEOUT); + if (!pagemap_is_swapped(pagemap_fd, mem)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + } + + fn(mem, pagesize); +munmap: + munmap(mem, pagesize); +} + +static void run_with_base_page(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with base page\n", desc); + do_run_with_base_page(fn, false); +} + +static void run_with_base_page_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... 
with swapped out base page\n", desc); + do_run_with_base_page(fn, true); +} + +enum thp_run { + THP_RUN_PMD, + THP_RUN_PMD_SWAPOUT, + THP_RUN_PTE, + THP_RUN_PTE_SWAPOUT, + THP_RUN_SINGLE_PTE, + THP_RUN_SINGLE_PTE_SWAPOUT, + THP_RUN_PARTIAL_MREMAP, + THP_RUN_PARTIAL_SHARED, +}; + +static void do_run_with_thp(test_fn fn, enum thp_run thp_run) +{ + char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED; + size_t size, mmap_size, mremap_size; + int ret; + + /* For alignment purposes, we need twice the thp size. */ + mmap_size = 2 * thpsize; + mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mmap_mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return; + } + + /* We need a THP-aligned memory area. */ + mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); + + ret = madvise(mem, thpsize, MADV_HUGEPAGE); + if (ret) { + ksft_test_result_fail("MADV_HUGEPAGE failed\n"); + goto munmap; + } + + /* + * Try to populate a THP. Touch the first sub-page and test if we get + * another sub-page populated automatically. + */ + mem[0] = 0; + if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) { + ksft_test_result_skip("Did not get a THP populated\n"); + goto munmap; + } + memset(mem, 0, thpsize); + + size = thpsize; + switch (thp_run) { + case THP_RUN_PMD: + case THP_RUN_PMD_SWAPOUT: + break; + case THP_RUN_PTE: + case THP_RUN_PTE_SWAPOUT: + /* + * Trigger PTE-mapping the THP by temporarily mapping a single + * subpage R/O. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto munmap; + } + break; + case THP_RUN_SINGLE_PTE: + case THP_RUN_SINGLE_PTE_SWAPOUT: + /* + * Discard all but a single subpage of that PTE-mapped THP. What + * remains is a single PTE mapping a single subpage. + */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED); + if (ret) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto munmap; + } + size = pagesize; + break; + case THP_RUN_PARTIAL_MREMAP: + /* + * Remap half of the THP. We need some new memory location + * for that. + */ + mremap_size = thpsize / 2; + mremap_mem = mmap(NULL, mremap_size, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + goto munmap; + } + tmp = mremap(mem + mremap_size, mremap_size, mremap_size, + MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem); + if (tmp != mremap_mem) { + ksft_test_result_fail("mremap() failed\n"); + goto munmap; + } + size = mremap_size; + break; + case THP_RUN_PARTIAL_SHARED: + /* + * Share the first page of the THP with a child and quit the + * child. This will result in some parts of the THP never + * have been shared. + */ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto munmap; + } + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto munmap; + } else if (!ret) { + exit(0); + } + wait(&ret); + /* Allow for sharing all pages again. 
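(MADV_DOFORK undoes the earlier MADV_DONTFORK for the range.)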
*/ + ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + goto munmap; + } + break; + default: + assert(false); + } + + switch (thp_run) { + case THP_RUN_PMD_SWAPOUT: + case THP_RUN_PTE_SWAPOUT: + case THP_RUN_SINGLE_PTE_SWAPOUT: + madvise(mem, size, MADV_PAGEOUT); + if (!range_is_swapped(mem, size)) { + ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); + goto munmap; + } + break; + default: + break; + } + + fn(mem, size); +munmap: + munmap(mmap_mem, mmap_size); + if (mremap_mem != MAP_FAILED) + munmap(mremap_mem, mremap_size); +} + +static void run_with_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with THP\n", desc); + do_run_with_thp(fn, THP_RUN_PMD); +} + +static void run_with_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc); + do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT); +} + +static void run_with_pte_mapped_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc); + do_run_with_thp(fn, THP_RUN_PTE); +} + +static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc); + do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT); +} + +static void run_with_single_pte_of_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE); +} + +static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc); + do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT); +} + +static void run_with_partial_mremap_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc); + do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP); +} + +static void run_with_partial_shared_thp(test_fn fn, const char *desc) +{ + ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc); + do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED); +} + +static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize) +{ + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB; + char *mem, *dummy; + + ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc, + hugetlbsize / 1024); + + flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT; + + mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (mem == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + return; + } + + /* Populate an huge page. */ + memset(mem, 0, hugetlbsize); + + /* + * We need a total of two hugetlb pages to handle COW/unsharing + * properly, otherwise we might get zapped by a SIGBUS. + */ + dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0); + if (dummy == MAP_FAILED) { + ksft_test_result_skip("need more free huge pages\n"); + goto munmap; + } + munmap(dummy, hugetlbsize); + + fn(mem, hugetlbsize); +munmap: + munmap(mem, hugetlbsize); +} + +struct test_case { + const char *desc; + test_fn fn; +}; + +/* + * Test cases that are specific to anonymous pages: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_test_cases[] = { + /* + * Basic COW tests for fork() without any GUP. If we miss to break COW, + * either the child can observe modifications by the parent or the + * other way around. 
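+ * (run_anon_test_case() runs each entry below against base pages, the
+ * various THP mapping states, and every detected hugetlb size.)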
+	 */
+	{
+		"Basic COW after fork()",
+		test_cow_in_parent,
+	},
+	/*
+	 * Basic test, but do an additional mprotect(PROT_READ)+
+	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
+	 */
+	{
+		"Basic COW after fork() with mprotect() optimization",
+		test_cow_in_parent_mprotect,
+	},
+	/*
+	 * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If
+	 * we fail to break COW, the child observes modifications by the parent.
+	 * This is CVE-2020-29374 reported by Jann Horn.
+	 */
+	{
+		"vmsplice() + unmap in child",
+		test_vmsplice_in_child,
+	},
+	/*
+	 * vmsplice() test, but do an additional mprotect(PROT_READ)+
+	 * mprotect(PROT_READ|PROT_WRITE) in the parent before write access.
+	 */
+	{
+		"vmsplice() + unmap in child with mprotect() optimization",
+		test_vmsplice_in_child_mprotect,
+	},
+	/*
+	 * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after
+	 * fork(); modify in the child. If we fail to break COW, the parent
+	 * observes modifications by the child.
+	 */
+	{
+		"vmsplice() before fork(), unmap in parent after fork()",
+		test_vmsplice_before_fork,
+	},
+	/*
+	 * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the
+	 * child. If we fail to break COW, the parent observes modifications by
+	 * the child.
+	 */
+	{
+		"vmsplice() + unmap in parent after fork()",
+		test_vmsplice_after_fork,
+	},
+#ifdef LOCAL_CONFIG_HAVE_LIBURING
+	/*
+	 * Take a R/W longterm pin and then map the page R/O into the page
+	 * table to trigger a write fault on next access. When modifying the
+	 * page, the page content must be visible via the pin.
+	 */
+	{
+		"R/O-mapping a page registered as iouring fixed buffer",
+		test_iouring_ro,
+	},
+	/*
+	 * Take a R/W longterm pin and then fork() a child. When modifying the
+	 * page, the page content must be visible via the pin. We expect the
+	 * pinned page to not get shared with the child.
+	 */
+	{
+		"fork() with an iouring fixed buffer",
+		test_iouring_fork,
+	},
+#endif /* LOCAL_CONFIG_HAVE_LIBURING */
+	/*
+	 * Take a R/O longterm pin on a R/O-mapped shared anonymous page.
+	 * When modifying the page via the page table, the page content change
+	 * must be visible via the pin.
+	 */
+	{
+		"R/O GUP pin on R/O-mapped shared page",
+		test_ro_pin_on_shared,
+	},
+	/* Same as above, but using GUP-fast. */
+	{
+		"R/O GUP-fast pin on R/O-mapped shared page",
+		test_ro_fast_pin_on_shared,
+	},
+	/*
+	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that
+	 * was previously shared. When modifying the page via the page table,
+	 * the page content change must be visible via the pin.
+	 */
+	{
+		"R/O GUP pin on R/O-mapped previously-shared page",
+		test_ro_pin_on_ro_previously_shared,
+	},
+	/* Same as above, but using GUP-fast. */
+	{
+		"R/O GUP-fast pin on R/O-mapped previously-shared page",
+		test_ro_fast_pin_on_ro_previously_shared,
+	},
+	/*
+	 * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page.
+	 * When modifying the page via the page table, the page content change
+	 * must be visible via the pin.
+	 */
+	{
+		"R/O GUP pin on R/O-mapped exclusive page",
+		test_ro_pin_on_ro_exclusive,
+	},
+	/* Same as above, but using GUP-fast.
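+	 * GUP-fast walks the page tables locklessly, without taking the
+	 * mmap_lock, and therefore exercises a separate pinning path.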
*/ + { + "R/O GUP-fast pin on R/O-mapped exclusive page", + test_ro_fast_pin_on_ro_exclusive, + }, +}; + +static void run_anon_test_case(struct test_case const *test_case) +{ + int i; + + run_with_base_page(test_case->fn, test_case->desc); + run_with_base_page_swap(test_case->fn, test_case->desc); + if (thpsize) { + run_with_thp(test_case->fn, test_case->desc); + run_with_thp_swap(test_case->fn, test_case->desc); + run_with_pte_mapped_thp(test_case->fn, test_case->desc); + run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); + run_with_single_pte_of_thp(test_case->fn, test_case->desc); + run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); + run_with_partial_mremap_thp(test_case->fn, test_case->desc); + run_with_partial_shared_thp(test_case->fn, test_case->desc); + } + for (i = 0; i < nr_hugetlbsizes; i++) + run_with_hugetlb(test_case->fn, test_case->desc, + hugetlbsizes[i]); +} + +static void run_anon_test_cases(void) +{ + int i; + + ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); + + for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) + run_anon_test_case(&anon_test_cases[i]); +} + +static int tests_per_anon_test_case(void) +{ + int tests = 2 + nr_hugetlbsizes; + + if (thpsize) + tests += 8; + return tests; +} + +enum anon_thp_collapse_test { + ANON_THP_COLLAPSE_UNSHARED, + ANON_THP_COLLAPSE_FULLY_SHARED, + ANON_THP_COLLAPSE_LOWER_SHARED, + ANON_THP_COLLAPSE_UPPER_SHARED, +}; + +static void do_test_anon_thp_collapse(char *mem, size_t size, + enum anon_thp_collapse_test test) +{ + struct comm_pipes comm_pipes; + char buf; + int ret; + + ret = setup_comm_pipes(&comm_pipes); + if (ret) { + ksft_test_result_fail("pipe() failed\n"); + return; + } + + /* + * Trigger PTE-mapping the THP by temporarily mapping a single subpage + * R/O, such that we can try collapsing it later. + */ + ret = mprotect(mem + pagesize, pagesize, PROT_READ); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); + if (ret) { + ksft_test_result_fail("mprotect() failed\n"); + goto close_comm_pipes; + } + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + /* Collapse before actually COW-sharing the page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* COW-share the full PTE-mapped THP. */ + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* Don't COW-share the upper part of the THP. */ + ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + /* Don't COW-share the lower part of the THP. 
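+		 * MADV_DONTFORK on the lower half keeps it out of the child,
+		 * so only the upper half is COW-shared across fork().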
*/ + ret = madvise(mem, size / 2, MADV_DONTFORK); + if (ret) { + ksft_test_result_fail("MADV_DONTFORK failed\n"); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + ret = fork(); + if (ret < 0) { + ksft_test_result_fail("fork() failed\n"); + goto close_comm_pipes; + } else if (!ret) { + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + case ANON_THP_COLLAPSE_FULLY_SHARED: + exit(child_memcmp_fn(mem, size, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_LOWER_SHARED: + exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + exit(child_memcmp_fn(mem + size / 2, size / 2, + &comm_pipes)); + break; + default: + assert(false); + } + } + + while (read(comm_pipes.child_ready[0], &buf, 1) != 1) + ; + + switch (test) { + case ANON_THP_COLLAPSE_UNSHARED: + break; + case ANON_THP_COLLAPSE_UPPER_SHARED: + case ANON_THP_COLLAPSE_LOWER_SHARED: + /* + * Revert MADV_DONTFORK such that we merge the VMAs and are + * able to actually collapse. + */ + ret = madvise(mem, size, MADV_DOFORK); + if (ret) { + ksft_test_result_fail("MADV_DOFORK failed\n"); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + /* FALLTHROUGH */ + case ANON_THP_COLLAPSE_FULLY_SHARED: + /* Collapse before anyone modified the COW-shared page. */ + ret = madvise(mem, size, MADV_COLLAPSE); + if (ret) { + ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", + strerror(errno)); + write(comm_pipes.parent_ready[1], "0", 1); + wait(&ret); + goto close_comm_pipes; + } + break; + default: + assert(false); + } + + /* Modify the page. */ + memset(mem, 0xff, size); + write(comm_pipes.parent_ready[1], "0", 1); + + wait(&ret); + if (WIFEXITED(ret)) + ret = WEXITSTATUS(ret); + else + ret = -EINVAL; + + ksft_test_result(!ret, "No leak from parent into child\n"); +close_comm_pipes: + close_comm_pipes(&comm_pipes); +} + +static void test_anon_thp_collapse_unshared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); +} + +static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); +} + +static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); +} + +static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) +{ + do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); +} + +/* + * Test cases that are specific to anonymous THP: pages in private mappings + * that may get shared via COW during fork(). + */ +static const struct test_case anon_thp_test_cases[] = { + /* + * Basic COW test for fork() without any GUP when collapsing a THP + * before fork(). + * + * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place + * collapse") might easily get COW handling wrong when not collapsing + * exclusivity information properly. + */ + { + "Basic COW after fork() when collapsing before fork()", + test_anon_thp_collapse_unshared, + }, + /* Basic COW test, but collapse after COW-sharing a full THP. */ + { + "Basic COW after fork() when collapsing after fork() (fully shared)", + test_anon_thp_collapse_fully_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the lower half of a + * THP. + */ + { + "Basic COW after fork() when collapsing after fork() (lower shared)", + test_anon_thp_collapse_lower_shared, + }, + /* + * Basic COW test, but collapse after COW-sharing the upper half of a + * THP. 
+	 */
+	{
+		"Basic COW after fork() when collapsing after fork() (upper shared)",
+		test_anon_thp_collapse_upper_shared,
+	},
+};
+
+static void run_anon_thp_test_cases(void)
+{
+	int i;
+
+	if (!thpsize)
+		return;
+
+	ksft_print_msg("[INFO] Anonymous THP tests\n");
+
+	for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) {
+		struct test_case const *test_case = &anon_thp_test_cases[i];
+
+		ksft_print_msg("[RUN] %s\n", test_case->desc);
+		do_run_with_thp(test_case->fn, THP_RUN_PMD);
+	}
+}
+
+static int tests_per_anon_thp_test_case(void)
+{
+	return thpsize ? 1 : 0;
+}
+
+typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size);
+
+static void test_cow(char *mem, const char *smem, size_t size)
+{
+	char *old = malloc(size);
+
+	/* Back up the original content. */
+	memcpy(old, smem, size);
+
+	/* Modify the page. */
+	memset(mem, 0xff, size);
+
+	/* See if we still read the old values via the other mapping. */
+	ksft_test_result(!memcmp(smem, old, size),
+			 "Other mapping not modified\n");
+	free(old);
+}
+
+static void test_ro_pin(char *mem, const char *smem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST, false);
+}
+
+static void test_ro_fast_pin(char *mem, const char *smem, size_t size)
+{
+	do_test_ro_pin(mem, size, RO_PIN_TEST, true);
+}
+
+static void run_with_zeropage(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+
+	ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc);
+
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE,
+		   MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0);
+	if (smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Read from the page to populate the shared zeropage. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+}
+
+static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, *mmap_mem, *mmap_smem, tmp;
+	size_t mmap_size;
+	int ret;
+
+	ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc);
+
+	if (!has_huge_zeropage) {
+		ksft_test_result_skip("Huge zeropage not enabled\n");
+		return;
+	}
+
+	/* For alignment purposes, we need twice the thp size. */
+	mmap_size = 2 * thpsize;
+	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mmap_mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		return;
+	}
+	mmap_smem = mmap(NULL, mmap_size, PROT_READ,
+			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (mmap_smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* We need a THP-aligned memory area. */
+	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
+	smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1));
+
+	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
+	ret |= madvise(smem, thpsize, MADV_HUGEPAGE);
+	if (ret) {
+		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
+		goto munmap;
+	}
+
+	/*
+	 * Read from the memory to populate the huge shared zeropage. Read from
+	 * the first sub-page and test if we get another sub-page populated
+	 * automatically.
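+	 * The huge zeropage is mapped by a single read-only PMD, so touching
+	 * one byte is expected to populate the whole THP-sized range at once.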
+	 */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
+	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
+		ksft_test_result_skip("Did not get THPs populated\n");
+		goto munmap;
+	}
+
+	fn(mem, smem, thpsize);
+munmap:
+	munmap(mmap_mem, mmap_size);
+	if (mmap_smem != MAP_FAILED)
+		munmap(mmap_smem, mmap_size);
+}
+
+static void run_with_memfd(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
+
+	fd = memfd_create("test", 0);
+	if (fd < 0) {
+		ksft_test_result_fail("memfd_create() failed\n");
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, pagesize)) {
+		ksft_test_result_fail("fallocate() failed\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the memfd. */
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto close;
+	}
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+	if (smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+close:
+	close(fd);
+}
+
+static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
+{
+	char *mem, *smem, tmp;
+	FILE *file;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
+
+	file = tmpfile();
+	if (!file) {
+		ksft_test_result_fail("tmpfile() failed\n");
+		return;
+	}
+
+	fd = fileno(file);
+	if (fd < 0) {
+		ksft_test_result_skip("fileno() failed\n");
+		fclose(file);
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, pagesize)) {
+		ksft_test_result_fail("fallocate() failed\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the temporary file. */
+	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto close;
+	}
+	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
+	if (smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, pagesize);
+munmap:
+	munmap(mem, pagesize);
+	if (smem != MAP_FAILED)
+		munmap(smem, pagesize);
+close:
+	fclose(file);
+}
+
+static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
+				   size_t hugetlbsize)
+{
+	int flags = MFD_HUGETLB;
+	char *mem, *smem, tmp;
+	int fd;
+
+	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
+		       hugetlbsize / 1024);
+
+	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
+
+	fd = memfd_create("test", flags);
+	if (fd < 0) {
+		ksft_test_result_skip("memfd_create() failed\n");
+		return;
+	}
+
+	/* File consists of a single page filled with zeroes. */
+	if (fallocate(fd, 0, 0, hugetlbsize)) {
+		ksft_test_result_skip("need more free huge pages\n");
+		goto close;
+	}
+
+	/* Create a private mapping of the memfd.
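+	 * Writes through this mapping have to COW into private hugetlb
+	 * pages; smem below stays a shared, read-only view of the same
+	 * file page.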
 */
+	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
+		   0);
+	if (mem == MAP_FAILED) {
+		ksft_test_result_skip("need more free huge pages\n");
+		goto close;
+	}
+	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
+	if (smem == MAP_FAILED) {
+		ksft_test_result_fail("mmap() failed\n");
+		goto munmap;
+	}
+
+	/* Fault the page in. */
+	tmp = *mem + *smem;
+	asm volatile("" : "+r" (tmp));
+
+	fn(mem, smem, hugetlbsize);
+munmap:
+	munmap(mem, hugetlbsize);
+	if (smem != MAP_FAILED)
+		munmap(smem, hugetlbsize);
+close:
+	close(fd);
+}
+
+struct non_anon_test_case {
+	const char *desc;
+	non_anon_test_fn fn;
+};
+
+/*
+ * Test cases that target any pages in private mappings that are not anonymous:
+ * pages that may get shared via COW independently of fork(). This includes
+ * the shared zeropage(s), pagecache pages, ...
+ */
+static const struct non_anon_test_case non_anon_test_cases[] = {
+	/*
+	 * Basic COW test without any GUP. If we fail to break COW, changes are
+	 * visible via other private/shared mappings.
+	 */
+	{
+		"Basic COW",
+		test_cow,
+	},
+	/*
+	 * Take a R/O longterm pin. When modifying the page via the page table,
+	 * the page content change must be visible via the pin.
+	 */
+	{
+		"R/O longterm GUP pin",
+		test_ro_pin,
+	},
+	/* Same as above, but using GUP-fast. */
+	{
+		"R/O longterm GUP-fast pin",
+		test_ro_fast_pin,
+	},
+};
+
+static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
+{
+	int i;
+
+	run_with_zeropage(test_case->fn, test_case->desc);
+	run_with_memfd(test_case->fn, test_case->desc);
+	run_with_tmpfile(test_case->fn, test_case->desc);
+	if (thpsize)
+		run_with_huge_zeropage(test_case->fn, test_case->desc);
+	for (i = 0; i < nr_hugetlbsizes; i++)
+		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
+				       hugetlbsizes[i]);
+}
+
+static void run_non_anon_test_cases(void)
+{
+	int i;
+
+	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");
+
+	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
+		run_non_anon_test_case(&non_anon_test_cases[i]);
+}
+
+static int tests_per_non_anon_test_case(void)
+{
+	int tests = 3 + nr_hugetlbsizes;
+
+	if (thpsize)
+		tests += 1;
+	return tests;
+}
+
+int main(int argc, char **argv)
+{
+	int err;
+
+	pagesize = getpagesize();
+	detect_thpsize();
+	detect_hugetlbsizes();
+	detect_huge_zeropage();
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
+		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
+		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
+
+	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+	if (pagemap_fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+
+	run_anon_test_cases();
+	run_anon_thp_test_cases();
+	run_non_anon_test_cases();
+
+	err = ksft_get_fail_cnt();
+	if (err)
+		ksft_exit_fail_msg("%d out of %d tests failed\n",
+				   err, ksft_test_num());
+	return ksft_exit_pass();
+}
--- /dev/null
+++ a/tools/testing/selftests/mm/.gitignore
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: GPL-2.0-only
+cow
+hugepage-mmap
+hugepage-mremap
+hugepage-shm
+hugepage-vmemmap
+hugetlb-madvise
+khugepaged
+map_hugetlb
+map_populate
+thuge-gen
+compaction_test
+migration
+mlock2-tests
+mrelease_test
+mremap_dontunmap
+mremap_test
+on-fault-limit
+transhuge-stress
+protection_keys
+protection_keys_32
+protection_keys_64
+madv_populate
+userfaultfd
+mlock-intersect-test
+mlock-random-test
+virtual_address_range
+gup_test
+va_128TBswitch
+map_fixed_noreplace
+write_to_hugetlbfs
+hmm-tests
+memfd_secret
+soft-dirty
+split_huge_page_test
+ksm_tests
+local_config.h
+local_config.mk
--- /dev/null
+++ a/tools/testing/selftests/mm/gup_test.c
@@ -0,0 +1,271 @@
+#include <fcntl.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <pthread.h>
+#include <assert.h>
+#include <mm/gup_test.h>
+#include "../kselftest.h"
+
+#include "util.h"
+
+#define MB (1UL << 20)
+
+/* Just the flags we need, copied from mm.h: */
+#define FOLL_WRITE	0x01	/* check pte is writable */
+#define FOLL_TOUCH	0x02	/* mark page accessed */
+
+#define GUP_TEST_FILE "/sys/kernel/debug/gup_test"
+
+static unsigned long cmd = GUP_FAST_BENCHMARK;
+static int gup_fd, repeats = 1;
+static unsigned long size = 128 * MB;
+/* Serialize prints */
+static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static char *cmd_to_str(unsigned long cmd)
+{
+	switch (cmd) {
+	case GUP_FAST_BENCHMARK:
+		return "GUP_FAST_BENCHMARK";
+	case PIN_FAST_BENCHMARK:
+		return "PIN_FAST_BENCHMARK";
+	case PIN_LONGTERM_BENCHMARK:
+		return "PIN_LONGTERM_BENCHMARK";
+	case GUP_BASIC_TEST:
+		return "GUP_BASIC_TEST";
+	case PIN_BASIC_TEST:
+		return "PIN_BASIC_TEST";
+	case DUMP_USER_PAGES_TEST:
+		return "DUMP_USER_PAGES_TEST";
+	}
+	return "Unknown command";
+}
+
+void *gup_thread(void *data)
+{
+	struct gup_test gup = *(struct gup_test *)data;
+	int i;
+
+	/* Only report timing information on the *_BENCHMARK commands: */
+	if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) ||
+	    (cmd == PIN_LONGTERM_BENCHMARK)) {
+		for (i = 0; i < repeats; i++) {
+			gup.size = size;
+			if (ioctl(gup_fd, cmd, &gup)) {
+				perror("ioctl");
+				exit(1);
+			}
+
+			pthread_mutex_lock(&print_mutex);
+			printf("%s: Time: get:%lld put:%lld us",
+			       cmd_to_str(cmd), gup.get_delta_usec,
+			       gup.put_delta_usec);
+			if (gup.size != size)
+				printf(", truncated (size: %lld)", gup.size);
+			printf("\n");
+			pthread_mutex_unlock(&print_mutex);
+		}
+	} else {
+		gup.size = size;
+		if (ioctl(gup_fd, cmd, &gup)) {
+			perror("ioctl");
+			exit(1);
+		}
+
+		pthread_mutex_lock(&print_mutex);
+		printf("%s: done\n", cmd_to_str(cmd));
+		if (gup.size != size)
+			printf("Truncated (size: %lld)\n", gup.size);
+		pthread_mutex_unlock(&print_mutex);
+	}
+
+	return NULL;
+}
+
+int main(int argc, char **argv)
+{
+	struct gup_test gup = { 0 };
+	int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret;
+	int flags = MAP_PRIVATE, touch = 0;
+	char *file = "/dev/zero";
+	pthread_t *tid;
+	char *p;
+
+	while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) {
+		switch (opt) {
+		case 'a':
+			cmd = PIN_FAST_BENCHMARK;
+			break;
+		case 'b':
+			cmd = PIN_BASIC_TEST;
+			break;
+		case 'L':
+			cmd = PIN_LONGTERM_BENCHMARK;
+			break;
+		case 'c':
+			cmd = DUMP_USER_PAGES_TEST;
+			/*
+			 * Dump page 0 (index 1). May be overridden later, by
+			 * user's non-option arguments.
+			 *
+			 * .which_pages is one-based, so that zero can mean "do
+			 * nothing".
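+			 * (In other words, an entry value of N requests a dump
+			 * of page N-1, and zero entries are ignored.)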
+			 */
+			gup.which_pages[0] = 1;
+			break;
+		case 'p':
+			/* works only with DUMP_USER_PAGES_TEST */
+			gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
+			break;
+		case 'F':
+			/* strtol, so you can pass flags in hex form */
+			gup.gup_flags = strtol(optarg, 0, 0);
+			break;
+		case 'j':
+			nthreads = atoi(optarg);
+			break;
+		case 'm':
+			size = atoi(optarg) * MB;
+			break;
+		case 'r':
+			repeats = atoi(optarg);
+			break;
+		case 'n':
+			nr_pages = atoi(optarg);
+			break;
+		case 't':
+			thp = 1;
+			break;
+		case 'T':
+			thp = 0;
+			break;
+		case 'U':
+			cmd = GUP_BASIC_TEST;
+			break;
+		case 'u':
+			cmd = GUP_FAST_BENCHMARK;
+			break;
+		case 'w':
+			write = 1;
+			break;
+		case 'W':
+			write = 0;
+			break;
+		case 'f':
+			file = optarg;
+			break;
+		case 'S':
+			flags &= ~MAP_PRIVATE;
+			flags |= MAP_SHARED;
+			break;
+		case 'H':
+			flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
+			break;
+		case 'z':
+			/* fault pages in gup, do not fault in userland */
+			touch = 1;
+			break;
+		default:
+			return -1;
+		}
+	}
+
+	if (optind < argc) {
+		int extra_arg_count = 0;
+		/*
+		 * For example:
+		 *
+		 *   ./gup_test -c 0 1 0x1001
+		 *
+		 * ...to dump pages 0, 1, and 4097
+		 */
+
+		while ((optind < argc) &&
+		       (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) {
+			/*
+			 * Do the 1-based indexing here, so that the user can
+			 * use normal 0-based indexing on the command line.
+			 */
+			long page_index = strtol(argv[optind], 0, 0) + 1;
+
+			gup.which_pages[extra_arg_count] = page_index;
+			extra_arg_count++;
+			optind++;
+		}
+	}
+
+	/* O_CREAT requires an explicit mode. */
+	filed = open(file, O_RDWR|O_CREAT, 0664);
+	if (filed < 0) {
+		perror("open");
+		exit(filed);
+	}
+
+	gup.nr_pages_per_call = nr_pages;
+	if (write)
+		gup.gup_flags |= FOLL_WRITE;
+
+	gup_fd = open(GUP_TEST_FILE, O_RDWR);
+	if (gup_fd == -1) {
+		switch (errno) {
+		case EACCES:
+			if (getuid())
+				printf("Please run this test as root\n");
+			break;
+		case ENOENT:
+			if (opendir("/sys/kernel/debug") == NULL) {
+				printf("mount debugfs at /sys/kernel/debug\n");
+				break;
+			}
+			printf("check if CONFIG_GUP_TEST is enabled in kernel config\n");
+			break;
+		default:
+			perror("failed to open " GUP_TEST_FILE);
+			break;
+		}
+		exit(KSFT_SKIP);
+	}
+
+	p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
+	if (p == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+	gup.addr = (unsigned long)p;
+
+	if (thp == 1)
+		madvise(p, size, MADV_HUGEPAGE);
+	else if (thp == 0)
+		madvise(p, size, MADV_NOHUGEPAGE);
+
+	/*
+	 * FOLL_TOUCH, in gup_test, is used as an either/or case: either
+	 * fault pages in from the kernel via FOLL_TOUCH, or fault them
+	 * in here, from user space. This allows comparison of performance
+	 * between those two cases.
+	 */
+	if (touch) {
+		gup.gup_flags |= FOLL_TOUCH;
+	} else {
+		for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
+			p[0] = 0;
+	}
+
+	tid = malloc(sizeof(pthread_t) * nthreads);
+	assert(tid);
+	for (i = 0; i < nthreads; i++) {
+		ret = pthread_create(&tid[i], NULL, gup_thread, &gup);
+		assert(ret == 0);
+	}
+	for (i = 0; i < nthreads; i++) {
+		ret = pthread_join(tid[i], NULL);
+		assert(ret == 0);
+	}
+	free(tid);
+
+	return 0;
+}
--- /dev/null
+++ a/tools/testing/selftests/mm/hmm-tests.c
@@ -0,0 +1,2054 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * HMM stands for Heterogeneous Memory Management; it is a helper layer inside
+ * the Linux kernel to help device drivers mirror a process address space in
+ * the device. This allows the device to use the same address space, which
+ * makes communication and data exchange a lot easier.
+ * + * This framework's sole purpose is to exercise various code paths inside + * the kernel to make sure that HMM performs as expected and to flush out any + * bugs. + */ + +#include "../kselftest_harness.h" + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <unistd.h> +#include <strings.h> +#include <time.h> +#include <pthread.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <sys/ioctl.h> + + +/* + * This is a private UAPI to the kernel test module so it isn't exported + * in the usual include/uapi/... directory. + */ +#include <lib/test_hmm_uapi.h> +#include <mm/gup_test.h> + +struct hmm_buffer { + void *ptr; + void *mirror; + unsigned long size; + int fd; + uint64_t cpages; + uint64_t faults; +}; + +enum { + HMM_PRIVATE_DEVICE_ONE, + HMM_PRIVATE_DEVICE_TWO, + HMM_COHERENCE_DEVICE_ONE, + HMM_COHERENCE_DEVICE_TWO, +}; + +#define TWOMEG (1 << 21) +#define HMM_BUFFER_SIZE (1024 << 12) +#define HMM_PATH_MAX 64 +#define NTIMES 10 + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) +/* Just the flags we need, copied from mm.h: */ +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ + +FIXTURE(hmm) +{ + int fd; + unsigned int page_size; + unsigned int page_shift; +}; + +FIXTURE_VARIANT(hmm) +{ + int device_number; +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_private) +{ + .device_number = HMM_PRIVATE_DEVICE_ONE, +}; + +FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) +{ + .device_number = HMM_COHERENCE_DEVICE_ONE, +}; + +FIXTURE(hmm2) +{ + int fd0; + int fd1; + unsigned int page_size; + unsigned int page_shift; +}; + +FIXTURE_VARIANT(hmm2) +{ + int device_number0; + int device_number1; +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) +{ + .device_number0 = HMM_PRIVATE_DEVICE_ONE, + .device_number1 = HMM_PRIVATE_DEVICE_TWO, +}; + +FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) +{ + .device_number0 = HMM_COHERENCE_DEVICE_ONE, + .device_number1 = HMM_COHERENCE_DEVICE_TWO, +}; + +static int hmm_open(int unit) +{ + char pathname[HMM_PATH_MAX]; + int fd; + + snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit); + fd = open(pathname, O_RDWR, 0); + if (fd < 0) + fprintf(stderr, "could not open hmm dmirror driver (%s)\n", + pathname); + return fd; +} + +static bool hmm_is_coherent_type(int dev_num) +{ + return (dev_num >= HMM_COHERENCE_DEVICE_ONE); +} + +FIXTURE_SETUP(hmm) +{ + self->page_size = sysconf(_SC_PAGE_SIZE); + self->page_shift = ffs(self->page_size) - 1; + + self->fd = hmm_open(variant->device_number); + if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) + SKIP(exit(0), "DEVICE_COHERENT not available"); + ASSERT_GE(self->fd, 0); +} + +FIXTURE_SETUP(hmm2) +{ + self->page_size = sysconf(_SC_PAGE_SIZE); + self->page_shift = ffs(self->page_size) - 1; + + self->fd0 = hmm_open(variant->device_number0); + if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) + SKIP(exit(0), "DEVICE_COHERENT not available"); + ASSERT_GE(self->fd0, 0); + self->fd1 = hmm_open(variant->device_number1); + ASSERT_GE(self->fd1, 0); +} + +FIXTURE_TEARDOWN(hmm) +{ + int ret = close(self->fd); + + ASSERT_EQ(ret, 0); + self->fd = -1; +} + +FIXTURE_TEARDOWN(hmm2) +{ + int ret = close(self->fd0); + + ASSERT_EQ(ret, 0); + self->fd0 = -1; + + ret = close(self->fd1); + ASSERT_EQ(ret, 0); + self->fd1 = -1; +} + +static int hmm_dmirror_cmd(int fd, + unsigned long request, + struct hmm_buffer *buffer, + unsigned long npages) 
+{ + struct hmm_dmirror_cmd cmd; + int ret; + + /* Simulate a device reading system memory. */ + cmd.addr = (__u64)buffer->ptr; + cmd.ptr = (__u64)buffer->mirror; + cmd.npages = npages; + + for (;;) { + ret = ioctl(fd, request, &cmd); + if (ret == 0) + break; + if (errno == EINTR) + continue; + return -errno; + } + buffer->cpages = cmd.cpages; + buffer->faults = cmd.faults; + + return 0; +} + +static void hmm_buffer_free(struct hmm_buffer *buffer) +{ + if (buffer == NULL) + return; + + if (buffer->ptr) + munmap(buffer->ptr, buffer->size); + free(buffer->mirror); + free(buffer); +} + +/* + * Create a temporary file that will be deleted on close. + */ +static int hmm_create_file(unsigned long size) +{ + char path[HMM_PATH_MAX]; + int fd; + + strcpy(path, "/tmp"); + fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600); + if (fd >= 0) { + int r; + + do { + r = ftruncate(fd, size); + } while (r == -1 && errno == EINTR); + if (!r) + return fd; + close(fd); + } + return -1; +} + +/* + * Return a random unsigned number. + */ +static unsigned int hmm_random(void) +{ + static int fd = -1; + unsigned int r; + + if (fd < 0) { + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + fprintf(stderr, "%s:%d failed to open /dev/urandom\n", + __FILE__, __LINE__); + return ~0U; + } + } + read(fd, &r, sizeof(r)); + return r; +} + +static void hmm_nanosleep(unsigned int n) +{ + struct timespec t; + + t.tv_sec = 0; + t.tv_nsec = n; + nanosleep(&t, NULL); +} + +static int hmm_migrate_sys_to_dev(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); +} + +static int hmm_migrate_dev_to_sys(int fd, + struct hmm_buffer *buffer, + unsigned long npages) +{ + return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); +} + +/* + * Simple NULL test of device open/close. + */ +TEST_F(hmm, open_close) +{ +} + +/* + * Read private anonymous memory. + */ +TEST_F(hmm, anon_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int val; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* + * Initialize buffer in system memory but leave the first two pages + * zero (pte_none and pfn_zero). + */ + i = 2 * self->page_size / sizeof(*ptr); + for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Set buffer permission to read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Populate the CPU page table with a special zero page. */ + val = *(int *)(buffer->ptr + self->page_size); + ASSERT_EQ(val, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. 
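+	 * The mirror buffer holds the device's view: the first two pages were
+	 * never written and must read back as zero.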
*/ + ptr = buffer->mirror; + for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + for (; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Read private anonymous memory which has been protected with + * mprotect() PROT_NONE. + */ +TEST_F(hmm, anon_read_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize mirror buffer so we can verify it isn't written. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + /* Protect buffer from reading. */ + ret = mprotect(buffer->ptr, size, PROT_NONE); + ASSERT_EQ(ret, 0); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, -EFAULT); + + /* Allow CPU to read the buffer so we can check it. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory. + */ +TEST_F(hmm, anon_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write private anonymous memory which has been protected with + * mprotect() PROT_READ. 
+ */ +TEST_F(hmm, anon_write_prot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading a zero page of memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + ASSERT_EQ(buffer->faults, 1); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, -EPERM); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Now allow writing and see that the zero page is replaced. */ + ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Check that a device writing an anonymous private mapping + * will copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. 
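+	 * The device write must COW in the child; the parent's copy of the
+	 * buffer (verified above) has to remain untouched.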
*/ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Check that a device writing an anonymous shared mapping + * will not copy-on-write if a child process inherits the mapping. + */ +TEST_F(hmm, anon_write_child_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + pid_t pid; + int child_fd; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did change. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + return; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + + close(child_fd); + exit(0); +} + +/* + * Write private anonymous huge page. + */ +TEST_F(hmm, anon_write_huge) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = 2 * TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + size = TWOMEG; + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. 
*/ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Read numeric data from raw and tagged kernel status files. Used to read + * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). + */ +static long file_read_ulong(char *file, const char *tag) +{ + int fd; + char buf[2048]; + int len; + char *p, *q; + long val; + + fd = open(file, O_RDONLY); + if (fd < 0) { + /* Error opening the file */ + return -1; + } + + len = read(fd, buf, sizeof(buf)); + close(fd); + if (len < 0) { + /* Error in reading the file */ + return -1; + } + if (len == sizeof(buf)) { + /* Error file is too large */ + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (*q != ' ') { + /* Error parsing the file */ + return -1; + } + + return val; +} + +/* + * Write huge TLBFS page. + */ +TEST_F(hmm, anon_write_hugetlbfs) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long default_hsize; + unsigned long i; + int *ptr; + int ret; + + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { + free(buffer); + SKIP(return, "Huge page could not be allocated"); + } + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Read mmap'ed file memory. + */ +TEST_F(hmm, file_read) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + /* Write initial contents of the file. 
*/ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + len = pwrite(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + memset(buffer->mirror, 0, size); + + buffer->ptr = mmap(NULL, size, + PROT_READ, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Write mmap'ed file memory. + */ +TEST_F(hmm, file_write) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + int fd; + ssize_t len; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + fd = hmm_create_file(size); + ASSERT_GE(fd, 0); + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = fd; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device wrote. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Check that the device also wrote the file. */ + len = pread(fd, buffer->mirror, size, 0); + ASSERT_EQ(len, size); + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory. + */ +TEST_F(hmm, migrate) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous memory to device private memory and fault some of it back + * to system memory, then try migrating the resulting mix of system and device + * private memory to the device. 
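+ * (The CPU reads below fault the first half back to system memory, so the
+ * second migration call has to handle both residencies at once.)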
+ */ +TEST_F(hmm, migrate_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault half the pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate memory to the device again. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +TEST_F(hmm, migrate_release) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Release device memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); + ASSERT_EQ(ret, 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Migrate anonymous shared memory to device private memory. + */ +TEST_F(hmm, migrate_shared) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Migrate memory to device. 
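+	 * This is expected to fail: shared anonymous memory cannot be
+	 * migrated to device-private memory, so the call reports -ENOENT.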
 */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, -ENOENT);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Try to migrate various memory types to device private memory.
+ */
+TEST_F(hmm2, migrate_mixed)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	int *ptr;
+	unsigned char *p;
+	int ret;
+	int val;
+
+	npages = 6;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	/* Reserve a range of addresses. */
+	buffer->ptr = mmap(NULL, size,
+			   PROT_NONE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+	p = buffer->ptr;
+
+	/* Migrating a protected area should be an error. */
+	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages);
+	ASSERT_EQ(ret, -EINVAL);
+
+	/* Punch a hole after the first page address. */
+	ret = munmap(buffer->ptr + self->page_size, self->page_size);
+	ASSERT_EQ(ret, 0);
+
+	/* We expect an error if the vma doesn't cover the range. */
+	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3);
+	ASSERT_EQ(ret, -EINVAL);
+
+	/* Page 2 will be a read-only zero page. */
+	ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size,
+		       PROT_READ);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 2 * self->page_size);
+	val = *ptr + 3;
+	ASSERT_EQ(val, 3);
+
+	/* Page 3 will be read-only. */
+	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+		       PROT_READ | PROT_WRITE);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 3 * self->page_size);
+	*ptr = val;
+	ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size,
+		       PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Page 4-5 will be read-write. */
+	ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size,
+		       PROT_READ | PROT_WRITE);
+	ASSERT_EQ(ret, 0);
+	ptr = (int *)(buffer->ptr + 4 * self->page_size);
+	*ptr = val;
+	ptr = (int *)(buffer->ptr + 5 * self->page_size);
+	*ptr = val;
+
+	/* Now try to migrate pages 2-5 to device 1. */
+	buffer->ptr = p + 2 * self->page_size;
+	ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, 4);
+
+	/* Page 5 won't be migrated to device 0 because it's on device 1. */
+	buffer->ptr = p + 5 * self->page_size;
+	ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1);
+	ASSERT_EQ(ret, -ENOENT);
+
+	buffer->ptr = p;
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Migrate anonymous memory to device memory and back to system memory
+ * multiple times. With a device-private configuration, the migration back
+ * happens via CPU page faults on access. With a device-coherent
+ * configuration, the pages must be migrated back explicitly: the CPU can
+ * access coherent device memory directly, so such accesses
+ * never generate a page fault.
+ */ +TEST_F(hmm, migrate_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate back to system memory and check them. */ + if (hmm_is_coherent_type(variant->device_number)) { + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + } + + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); + } +} + +/* + * Read anonymous memory multiple times. + */ +TEST_F(hmm, anon_read_multiple) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + unsigned long c; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; c++) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + /* Simulate a device reading system memory. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i + c); + + hmm_buffer_free(buffer); + } +} + +void *unmap_buffer(void *p) +{ + struct hmm_buffer *buffer = p; + + /* Delay for a bit and then unmap buffer while it is being read. */ + hmm_nanosleep(hmm_random() % 32000); + munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); + buffer->ptr = NULL; + + return NULL; +} + +/* + * Try reading anonymous memory while it is being unmapped. 
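+ * The racing munmap() may make the device read fail, so the mirror contents
+ * are only checked when the read happened to succeed.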
+ */ +TEST_F(hmm, anon_teardown) +{ + unsigned long npages; + unsigned long size; + unsigned long c; + void *ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + for (c = 0; c < NTIMES; ++c) { + pthread_t thread; + struct hmm_buffer *buffer; + unsigned long i; + int *ptr; + int rc; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i + c; + + rc = pthread_create(&thread, NULL, unmap_buffer, buffer); + ASSERT_EQ(rc, 0); + + /* Simulate a device reading system memory. */ + rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, + npages); + if (rc == 0) { + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); + ++i) + ASSERT_EQ(ptr[i], i + c); + } + + pthread_join(thread, &ret); + hmm_buffer_free(buffer); + } +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm, mixedmap) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned char *m; + int ret; + + npages = 1; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + self->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); + + hmm_buffer_free(buffer); +} + +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm2, snapshot) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + int *ptr; + unsigned char *p; + unsigned char *m; + int ret; + int val; + + npages = 7; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + p = buffer->ptr; + + /* Punch a hole after the first page address. */ + ret = munmap(buffer->ptr + self->page_size, self->page_size); + ASSERT_EQ(ret, 0); + + /* Page 2 will be read-only zero page. */ + ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 2 * self->page_size); + val = *ptr + 3; + ASSERT_EQ(val, 3); + + /* Page 3 will be read-only. 
*/ + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 3 * self->page_size); + *ptr = val; + ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, + PROT_READ); + ASSERT_EQ(ret, 0); + + /* Page 4-6 will be read-write. */ + ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, + PROT_READ | PROT_WRITE); + ASSERT_EQ(ret, 0); + ptr = (int *)(buffer->ptr + 4 * self->page_size); + *ptr = val; + + /* Page 5 will be migrated to device 0. */ + buffer->ptr = p + 5 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Page 6 will be migrated to device 1. */ + buffer->ptr = p + 6 * self->page_size; + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, 1); + + /* Simulate a device snapshotting CPU pagetables. */ + buffer->ptr = p; + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR); + ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); + ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); + if (!hmm_is_coherent_type(variant->device_number0)) { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); + } else { + ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | + HMM_DMIRROR_PROT_WRITE); + ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | + HMM_DMIRROR_PROT_WRITE); + } + + hmm_buffer_free(buffer); +} + +/* + * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that + * should be mapped by a large page table entry. + */ +TEST_F(hmm, compound) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long default_hsize; + int *ptr; + unsigned char *m; + int ret; + unsigned long i; + + /* Skip test if we can't allocate a hugetlbfs page. */ + + default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); + if (default_hsize < 0 || default_hsize*1024 < default_hsize) + SKIP(return, "Huge page size could not be determined"); + default_hsize = default_hsize*1024; /* KB to B */ + + size = ALIGN(TWOMEG, default_hsize); + npages = size >> self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (buffer->ptr == MAP_FAILED) { + free(buffer); + return; + } + + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Initialize the pages the device will snapshot in buffer->ptr. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE | + HMM_DMIRROR_PROT_PMD); + + /* Make the region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate a device snapshotting CPU pagetables. 
*/ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + for (i = 0; i < npages; ++i) + ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | + HMM_DMIRROR_PROT_PMD); + + munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + hmm_buffer_free(buffer); +} + +/* + * Test two devices reading the same memory (double mapped). + */ +TEST_F(hmm2, double_map) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = 6; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Make region read-only. */ + ret = mprotect(buffer->ptr, size, PROT_READ); + ASSERT_EQ(ret, 0); + + /* Simulate device 0 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Simulate device 1 reading system memory. */ + ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Migrate pages to device 1 and try to read from device 0. */ + ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); + + /* Check what device 0 read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + hmm_buffer_free(buffer); +} + +/* + * Basic check of exclusive faulting. + */ +TEST_F(hmm, exclusive) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + int *ptr; + int ret; + + npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Map memory exclusively for device access. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. 
*/
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	/* Fault pages back to system memory and check them. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i]++, i);
+
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i+1);
+
+	/* Check that atomic access has been revoked. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+
+	hmm_buffer_free(buffer);
+}
+
+TEST_F(hmm, exclusive_mprotect)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Map memory exclusively for device access. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	ret = mprotect(buffer->ptr, size, PROT_READ);
+	ASSERT_EQ(ret, 0);
+
+	/* Simulate a device writing system memory. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages);
+	ASSERT_EQ(ret, -EPERM);
+
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Check that copy-on-write works.
+ */
+TEST_F(hmm, exclusive_cow)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+
+	npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift;
+	ASSERT_NE(npages, 0);
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Map memory exclusively for device access. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	/* Both parent and child fall through and run the checks below. */
+	fork();
+
+	/* Fault pages back to system memory and check them. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i]++, i);
+
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i+1);
+
+	hmm_buffer_free(buffer);
+}
+
+static int gup_test_exec(int gup_fd, unsigned long addr, int cmd,
+			 int npages, int size, int flags)
+{
+	struct gup_test gup = {
+		.nr_pages_per_call = npages,
+		.addr = addr,
+		.gup_flags = FOLL_WRITE | flags,
+		.size = size,
+	};
+
+	if (ioctl(gup_fd, cmd, &gup)) {
+		perror("gup_test ioctl");
+		return errno;
+	}
+
+	return 0;
+}
+
+/*
+ * Test get_user_pages() on device pages through gup_test, pinning some of
+ * them with the PIN_LONGTERM flag.
+ * Long-term pinning should trigger a migration back to system memory for
+ * both private and coherent type pages.
+ * This test makes use of the gup_test module; make sure GUP_TEST_CONFIG is
+ * added to your kernel configuration before you run it.
+ */
+TEST_F(hmm, hmm_gup_test)
+{
+	struct hmm_buffer *buffer;
+	int gup_fd;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	unsigned char *m;
+
+	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
+	if (gup_fd == -1)
+		SKIP(return, "Skipping test, could not find gup_test driver");
+
+	npages = 4;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	/* Check what the device read. */
+	for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	ASSERT_EQ(gup_test_exec(gup_fd,
+				(unsigned long)buffer->ptr,
+				GUP_BASIC_TEST, 1, self->page_size, 0), 0);
+	ASSERT_EQ(gup_test_exec(gup_fd,
+				(unsigned long)buffer->ptr + 1 * self->page_size,
+				GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0);
+	ASSERT_EQ(gup_test_exec(gup_fd,
+				(unsigned long)buffer->ptr + 2 * self->page_size,
+				PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0);
+	ASSERT_EQ(gup_test_exec(gup_fd,
+				(unsigned long)buffer->ptr + 3 * self->page_size,
+				PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0);
+
+	/* Take a snapshot of the CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	m = buffer->mirror;
+	if (hmm_is_coherent_type(variant->device_number)) {
+		ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]);
+		ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]);
+	} else {
+		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]);
+		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]);
+	}
+	ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]);
+	ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]);
+	/*
+	 * Check the content of the pages again. Make sure there's no
+	 * corrupted data.
+	 */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ASSERT_EQ(ptr[i], i);
+
+	close(gup_fd);
+	hmm_buffer_free(buffer);
+}
+
+/*
+ * Test copy-on-write on device pages.
+ * Writing to COW device-private page(s) triggers a page fault that first
+ * migrates the pages back to system memory, where they are then duplicated.
+ * For the COW device-coherent case, pages are duplicated directly from
+ * device memory.
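+ *
+ * Minimal sketch of the COW trigger (illustrative; this is what the test
+ * body below does):
+ *
+ *	pid = fork();		// child keeps a reference to the pages
+ *	if (!pid)
+ *		while (1)	// child just waits for SIGTERM
+ *			;
+ *	ptr[0] = 0;		// parent write => page is duplicated (COW)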
+ */
+TEST_F(hmm, hmm_cow_in_device)
+{
+	struct hmm_buffer *buffer;
+	unsigned long npages;
+	unsigned long size;
+	unsigned long i;
+	int *ptr;
+	int ret;
+	unsigned char *m;
+	pid_t pid;
+	int status;
+
+	npages = 4;
+	size = npages << self->page_shift;
+
+	buffer = malloc(sizeof(*buffer));
+	ASSERT_NE(buffer, NULL);
+
+	buffer->fd = -1;
+	buffer->size = size;
+	buffer->mirror = malloc(size);
+	ASSERT_NE(buffer->mirror, NULL);
+
+	buffer->ptr = mmap(NULL, size,
+			   PROT_READ | PROT_WRITE,
+			   MAP_PRIVATE | MAP_ANONYMOUS,
+			   buffer->fd, 0);
+	ASSERT_NE(buffer->ptr, MAP_FAILED);
+
+	/* Initialize buffer in system memory. */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Migrate memory to device. */
+	ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+
+	pid = fork();
+	if (pid == -1)
+		ASSERT_EQ(pid, 0);
+	if (!pid) {
+		/* Child process waits for SIGTERM from the parent. */
+		while (1) {
+		}
+		perror("Should not reach this");
+		exit(0);
+	}
+	/* The parent process writes to the COW page(s) and gets a
+	 * new copy in system memory. For device-private pages, this
+	 * write first causes a migration back to system memory.
+	 */
+	for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i)
+		ptr[i] = i;
+
+	/* Terminate the child and wait for it. */
+	EXPECT_EQ(0, kill(pid, SIGTERM));
+	EXPECT_EQ(pid, waitpid(pid, &status, 0));
+	EXPECT_NE(0, WIFSIGNALED(status));
+	EXPECT_EQ(SIGTERM, WTERMSIG(status));
+
+	/* Take a snapshot of the CPU pagetables. */
+	ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages);
+	ASSERT_EQ(ret, 0);
+	ASSERT_EQ(buffer->cpages, npages);
+	m = buffer->mirror;
+	for (i = 0; i < npages; i++)
+		ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]);
+
+	hmm_buffer_free(buffer);
+}
+
+TEST_HARNESS_MAIN
--- /dev/null
+++ a/tools/testing/selftests/mm/hugepage-mmap.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mmap:
+ *
+ * Example of using huge page memory in a user application via the mmap
+ * system call. Before running this application, make sure that the
+ * administrator has mounted the hugetlbfs filesystem (on some directory
+ * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this
+ * example, the app is requesting memory of size 256MB that is backed by
+ * huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages. That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required. If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
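+ *
+ * Example setup (assumed values; 128 default-sized 2MB huge pages cover
+ * the 256MB used here, so adjust the count for other huge page sizes):
+ *
+ *	echo 128 > /proc/sys/vm/nr_hugepages
+ *	mount -t hugetlbfs nodev /mnt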
+ */
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#define LENGTH (256UL*1024*1024)
+#define PROTECTION (PROT_READ | PROT_WRITE)
+
+/* Only ia64 requires this */
+#ifdef __ia64__
+#define ADDR (void *)(0x8000000000000000UL)
+#define FLAGS (MAP_SHARED | MAP_FIXED)
+#else
+#define ADDR (void *)(0x0UL)
+#define FLAGS (MAP_SHARED)
+#endif
+
+static void check_bytes(char *addr)
+{
+	printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < LENGTH; i++)
+		*(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr)
+{
+	unsigned long i;
+
+	check_bytes(addr);
+	for (i = 0; i < LENGTH; i++)
+		if (*(addr + i) != (char)i) {
+			printf("Mismatch at %lu\n", i);
+			return 1;
+		}
+	return 0;
+}
+
+int main(void)
+{
+	void *addr;
+	int fd, ret;
+
+	fd = memfd_create("hugepage-mmap", MFD_HUGETLB);
+	if (fd < 0) {
+		perror("memfd_create() failed");
+		exit(1);
+	}
+
+	addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		close(fd);
+		exit(1);
+	}
+
+	printf("Returned address is %p\n", addr);
+	check_bytes(addr);
+	write_bytes(addr);
+	ret = read_bytes(addr);
+
+	munmap(addr, LENGTH);
+	close(fd);
+
+	return ret;
+}
--- /dev/null
+++ a/tools/testing/selftests/mm/hugepage-mremap.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-mremap:
+ *
+ * Example of remapping huge page memory in a user application using the
+ * mremap system call. The test creates its hugetlb file with
+ * memfd_create(MFD_HUGETLB), so no hugetlbfs file path needs to be passed.
+ * The amount of memory used by this test in MBs can optionally be passed
+ * as an argument. If no memory amount is passed, the default amount is
+ * 10MB.
+ *
+ * To make sure the test triggers pmd sharing and goes through the 'unshare'
+ * path in the mremap code use 1GB (1024) or more.
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <errno.h>
+#include <fcntl.h> /* Definition of O_* constants */
+#include <sys/syscall.h> /* Definition of SYS_* constants */
+#include <linux/userfaultfd.h>
+#include <sys/ioctl.h>
+#include <string.h>
+
+#define DEFAULT_LENGTH_MB 10UL
+#define MB_TO_BYTES(x) (x * 1024 * 1024)
+
+#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC)
+#define FLAGS (MAP_SHARED | MAP_ANONYMOUS)
+
+static void check_bytes(char *addr)
+{
+	printf("First hex is %x\n", *((unsigned int *)addr));
+}
+
+static void write_bytes(char *addr, size_t len)
+{
+	unsigned long i;
+
+	for (i = 0; i < len; i++)
+		*(addr + i) = (char)i;
+}
+
+static int read_bytes(char *addr, size_t len)
+{
+	unsigned long i;
+
+	check_bytes(addr);
+	for (i = 0; i < len; i++)
+		if (*(addr + i) != (char)i) {
+			printf("Mismatch at %lu\n", i);
+			return 1;
+		}
+	return 0;
+}
+
+static void register_region_with_uffd(char *addr, size_t len)
+{
+	long uffd; /* userfaultfd file descriptor */
+	struct uffdio_api uffdio_api;
+	struct uffdio_register uffdio_register;
+
+	/* Create and enable userfaultfd object. */
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd == -1) {
+		perror("userfaultfd");
+		exit(1);
+	}
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+		perror("ioctl-UFFDIO_API");
+		exit(1);
+	}
+
+	/* Create a private anonymous mapping. The memory will be
+	 * demand-zero paged--that is, not yet allocated.
+	 * When we actually touch the memory, it will be allocated via
+	 * the userfaultfd.
+	 */
+	addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	printf("Address returned by mmap() = %p\n", addr);
+
+	/* Register the memory range of the mapping we just created for
+	 * handling by the userfaultfd object. In this mode, we request
+	 * to track missing pages (i.e., pages that have not yet been
+	 * faulted in).
+	 */
+	uffdio_register.range.start = (unsigned long)addr;
+	uffdio_register.range.len = len;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+		perror("ioctl-UFFDIO_REGISTER");
+		exit(1);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	size_t length = 0;
+	int ret = 0, fd;
+
+	if (argc >= 2 && !strcmp(argv[1], "-h")) {
+		printf("Usage: %s [length_in_MB]\n", argv[0]);
+		exit(1);
+	}
+
+	/* Read the memory length as the first arg if valid, otherwise fall
+	 * back to the default length.
+	 */
+	if (argc >= 2)
+		length = (size_t)atoi(argv[1]);
+	else
+		length = DEFAULT_LENGTH_MB;
+
+	length = MB_TO_BYTES(length);
+	fd = memfd_create(argv[0], MFD_HUGETLB);
+	if (fd < 0) {
+		perror("memfd_create() failed");
+		exit(1);
+	}
+
+	/* mmap to a PUD aligned address to hopefully trigger pmd sharing. */
+	unsigned long suggested_addr = 0x7eaa40000000;
+	void *haddr = mmap((void *)suggested_addr, length, PROTECTION,
+			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+	printf("Map haddr: Returned address is %p\n", haddr);
+	if (haddr == MAP_FAILED) {
+		perror("mmap1");
+		exit(1);
+	}
+
+	/* mmap again to a dummy address to hopefully trigger pmd sharing. */
+	suggested_addr = 0x7daa40000000;
+	void *daddr = mmap((void *)suggested_addr, length, PROTECTION,
+			   MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0);
+	printf("Map daddr: Returned address is %p\n", daddr);
+	if (daddr == MAP_FAILED) {
+		perror("mmap3");
+		exit(1);
+	}
+
+	suggested_addr = 0x7faa40000000;
+	void *vaddr =
+		mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0);
+	printf("Map vaddr: Returned address is %p\n", vaddr);
+	if (vaddr == MAP_FAILED) {
+		perror("mmap2");
+		exit(1);
+	}
+
+	register_region_with_uffd(haddr, length);
+
+	void *addr = mremap(haddr, length, length,
+			    MREMAP_MAYMOVE | MREMAP_FIXED, vaddr);
+	if (addr == MAP_FAILED) {
+		perror("mremap");
+		exit(1);
+	}
+
+	printf("Mremap: Returned address is %p\n", addr);
+	check_bytes(addr);
+	write_bytes(addr, length);
+	ret = read_bytes(addr, length);
+
+	munmap(addr, length);
+
+	addr = mremap(addr, length, length, 0);
+	if (addr != MAP_FAILED) {
+		printf("mremap: Expected failure, but call succeeded\n");
+		exit(1);
+	}
+
+	close(fd);
+
+	return ret;
+}
--- /dev/null
+++ a/tools/testing/selftests/mm/hugepage-shm.c
@@ -0,0 +1,101 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-shm:
+ *
+ * Example of using huge page memory in a user application using Sys V shared
+ * memory system calls. In this example the app is requesting 256MB of
+ * memory that is backed by huge pages. The application uses the flag
+ * SHM_HUGETLB in the shmget system call to inform the kernel that it is
+ * requesting huge pages.
+ *
+ * For the ia64 architecture, the Linux kernel reserves Region number 4 for
+ * huge pages. That means that if one requires a fixed address, a huge page
+ * aligned address starting with 0x800000... will be required. If a fixed
+ * address is not required, the kernel will select an address in the proper
+ * range.
+ * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. + * + * Note: The default shared memory limit is quite low on many kernels, + * you may need to increase it via: + * + * echo 268435456 > /proc/sys/kernel/shmmax + * + * This will increase the maximum size per shared memory segment to 256MB. + * The other limit that you will hit eventually is shmall which is the + * total amount of shared memory in pages. To set it to 16GB on a system + * with a 4kB pagesize do: + * + * echo 4194304 > /proc/sys/kernel/shmall + */ + +#include <stdlib.h> +#include <stdio.h> +#include <sys/types.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/mman.h> + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + +#define LENGTH (256UL*1024*1024) + +#define dprintf(x) printf(x) + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define SHMAT_FLAGS (SHM_RND) +#else +#define ADDR (void *)(0x0UL) +#define SHMAT_FLAGS (0) +#endif + +int main(void) +{ + int shmid; + unsigned long i; + char *shmaddr; + + shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { + perror("shmget"); + exit(1); + } + printf("shmid: 0x%x\n", shmid); + + shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); + if (shmaddr == (char *)-1) { + perror("Shared memory attach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(2); + } + printf("shmaddr: %p\n", shmaddr); + + dprintf("Starting the writes:\n"); + for (i = 0; i < LENGTH; i++) { + shmaddr[i] = (char)(i); + if (!(i % (1024 * 1024))) + dprintf("."); + } + dprintf("\n"); + + dprintf("Starting the Check..."); + for (i = 0; i < LENGTH; i++) + if (shmaddr[i] != (char)i) { + printf("\nIndex %lu mismatched\n", i); + exit(3); + } + dprintf("Done.\n"); + + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + + shmctl(shmid, IPC_RMID, NULL); + + return 0; +} --- /dev/null +++ a/tools/testing/selftests/mm/hugepage-vmemmap.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test case of using hugepage memory in a user application using the + * mmap system call with MAP_HUGETLB flag. Before running this program + * make sure the administrator has allocated enough default sized huge + * pages to cover the 2 MB allocation. + */ +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <fcntl.h> + +#define MAP_LENGTH (2UL * 1024 * 1024) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#define PAGE_SIZE 4096 + +#define PAGE_COMPOUND_HEAD (1UL << 15) +#define PAGE_COMPOUND_TAIL (1UL << 16) +#define PAGE_HUGE (1UL << 17) + +#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) +#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) + +#define PM_PFRAME_BITS 55 +#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) + +/* + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
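+ *
+ * The PFN lookup below follows Documentation/admin-guide/mm/pagemap.rst:
+ * /proc/self/pagemap holds one 64-bit entry per virtual page, with the
+ * page frame number in bits 0-54. A rough equivalent of virt_to_pfn():
+ *
+ *	pread(fd, &entry, 8, vaddr / PAGE_SIZE * 8);
+ *	pfn = entry & ((1UL << 55) - 1);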
+ */
+#ifdef __ia64__
+#define MAP_ADDR (void *)(0x8000000000000000UL)
+#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED)
+#else
+#define MAP_ADDR NULL
+#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB)
+#endif
+
+static void write_bytes(char *addr, size_t length)
+{
+	unsigned long i;
+
+	for (i = 0; i < length; i++)
+		*(addr + i) = (char)i;
+}
+
+static unsigned long virt_to_pfn(void *addr)
+{
+	int fd;
+	unsigned long pagemap;
+
+	fd = open("/proc/self/pagemap", O_RDONLY);
+	if (fd < 0)
+		return -1UL;
+
+	lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET);
+	read(fd, &pagemap, sizeof(pagemap));
+	close(fd);
+
+	return pagemap & ~PM_PFRAME_MASK;
+}
+
+static int check_page_flags(unsigned long pfn)
+{
+	int fd, i;
+	unsigned long pageflags;
+
+	fd = open("/proc/kpageflags", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	lseek(fd, pfn * sizeof(pageflags), SEEK_SET);
+
+	read(fd, &pageflags, sizeof(pageflags));
+	if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) {
+		close(fd);
+		printf("Head page flags (%lx) are invalid\n", pageflags);
+		return -1;
+	}
+
+	/*
+	 * Pages other than the first page must be tail pages and must not be
+	 * head pages; this also verifies that the kernel has correctly set
+	 * the fake page_head to tail while hugetlb_free_vmemmap is enabled.
+	 */
+	for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) {
+		read(fd, &pageflags, sizeof(pageflags));
+		if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS ||
+		    (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) {
+			close(fd);
+			printf("Tail page flags (%lx) are invalid\n", pageflags);
+			return -1;
+		}
+	}
+
+	close(fd);
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	void *addr;
+	unsigned long pfn;
+
+	addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0);
+	if (addr == MAP_FAILED) {
+		perror("mmap");
+		exit(1);
+	}
+
+	/* Trigger allocation of HugeTLB page. */
+	write_bytes(addr, MAP_LENGTH);
+
+	pfn = virt_to_pfn(addr);
+	if (pfn == -1UL) {
+		munmap(addr, MAP_LENGTH);
+		perror("virt_to_pfn");
+		exit(1);
+	}
+
+	printf("Returned address is %p whose pfn is %lx\n", addr, pfn);
+
+	if (check_page_flags(pfn) < 0) {
+		munmap(addr, MAP_LENGTH);
+		perror("check_page_flags");
+		exit(1);
+	}
+
+	/* munmap() length of MAP_HUGETLB memory must be hugepage aligned */
+	if (munmap(addr, MAP_LENGTH)) {
+		perror("munmap");
+		exit(1);
+	}
+
+	return 0;
+}
--- /dev/null
+++ a/tools/testing/selftests/mm/hugetlb-madvise.c
@@ -0,0 +1,406 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * hugepage-madvise:
+ *
+ * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE
+ * on hugetlb mappings.
+ *
+ * Before running this test, make sure the administrator has pre-allocated
+ * at least MIN_FREE_PAGES hugetlb pages and that they are free. The test
+ * file is created with memfd_create(MFD_HUGETLB), so no hugetlbfs file
+ * path needs to be passed.
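+ *
+ * Example pre-allocation (assumed count; anything >= MIN_FREE_PAGES free
+ * default-sized huge pages will do):
+ *
+ *	echo 20 > /proc/sys/vm/nr_hugepages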
+ */
+
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#define __USE_GNU
+#include <fcntl.h>
+
+#define MIN_FREE_PAGES 20
+#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */
+
+#define validate_free_pages(exp_free) \
+	do { \
+		int fhp = get_free_hugepages(); \
+		if (fhp != (exp_free)) { \
+			printf("Unexpected number of free huge " \
+				"pages line %d\n", __LINE__); \
+			exit(1); \
+		} \
+	} while (0)
+
+unsigned long huge_page_size;
+unsigned long base_page_size;
+
+/*
+ * default_huge_page_size copied from mlock2-tests.c
+ */
+unsigned long default_huge_page_size(void)
+{
+	unsigned long hps = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return 0;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) {
+			hps <<= 10;
+			break;
+		}
+	}
+
+	free(line);
+	fclose(f);
+	return hps;
+}
+
+unsigned long get_free_hugepages(void)
+{
+	unsigned long fhp = 0;
+	char *line = NULL;
+	size_t linelen = 0;
+	FILE *f = fopen("/proc/meminfo", "r");
+
+	if (!f)
+		return fhp;
+	while (getline(&line, &linelen, f) > 0) {
+		if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1)
+			break;
+	}
+
+	free(line);
+	fclose(f);
+	return fhp;
+}
+
+void write_fault_pages(void *addr, unsigned long nr_pages)
+{
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		*((unsigned long *)(addr + (i * huge_page_size))) = i;
+}
+
+void read_fault_pages(void *addr, unsigned long nr_pages)
+{
+	unsigned long dummy = 0;
+	unsigned long i;
+
+	for (i = 0; i < nr_pages; i++)
+		dummy += *((unsigned long *)(addr + (i * huge_page_size)));
+}
+
+int main(int argc, char **argv)
+{
+	unsigned long free_hugepages;
+	void *addr, *addr2;
+	int fd;
+	int ret;
+
+	huge_page_size = default_huge_page_size();
+	if (!huge_page_size) {
+		printf("Unable to determine huge page size, exiting!\n");
+		exit(1);
+	}
+	base_page_size = sysconf(_SC_PAGE_SIZE);
+	if (!base_page_size) {
+		printf("Unable to determine base page size, exiting!\n");
+		exit(1);
+	}
+
+	free_hugepages = get_free_hugepages();
+	if (free_hugepages < MIN_FREE_PAGES) {
+		printf("Not enough free huge pages to test, exiting!\n");
+		exit(1);
+	}
+
+	fd = memfd_create(argv[0], MFD_HUGETLB);
+	if (fd < 0) {
+		perror("memfd_create() failed");
+		exit(1);
+	}
+
+	/*
+	 * Test validity of MADV_DONTNEED addr and length arguments. mmap
+	 * size is NR_HUGE_PAGES + 2. One page at the beginning and end of
+	 * the mapping will be unmapped so we KNOW there is nothing mapped
+	 * there.
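+	 *
+	 * Resulting layout (sketch):
+	 *
+	 *	[ hole ][ NR_HUGE_PAGES mapped huge pages ][ hole ]
+	 *	        ^addr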
+ */ + addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + if (munmap(addr, huge_page_size) || + munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, + huge_page_size)) { + perror("munmap"); + exit(1); + } + addr = addr + huge_page_size; + + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr before mapping should fail */ + ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid addr line %d\n", + __LINE__); + exit(1); + } + + /* addr + length after mapping should fail */ + ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with invalid length line %d\n", + __LINE__); + exit(1); + } + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test alignment of MADV_DONTNEED addr and length arguments + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* addr is not huge page size aligned and should fail */ + ret = madvise(addr + base_page_size, + NR_HUGE_PAGES * huge_page_size - base_page_size, + MADV_DONTNEED); + if (!ret) { + printf("Unexpected success of madvise call with unaligned start address %d\n", + __LINE__); + exit(1); + } + + /* addr + length should be aligned down to huge page size */ + if (madvise(addr, + ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, + MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all but last page in mapping */ + validate_free_pages(free_hugepages - 1); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + validate_free_pages(free_hugepages); + + /* + * Test MADV_DONTNEED on anonymous private mapping + */ + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, + -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + + /* should free all pages in mapping */ + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* read should not consume any pages */ + read_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages 
- (2 * NR_HUGE_PAGES)); + + /* madvise should free private pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* writes should allocate private pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * The fallocate below certainly should free the pages associated + * with the file. However, pages in the private mapping are also + * freed. This is not the 'correct' behavior, but is expected + * because this is how it has worked since the initial hugetlb + * implementation. + */ + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_DONTNEED on shared mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* write should not consume any pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* madvise should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* + * Test MADV_REMOVE on shared mapping of hugetlb file + * + * madvise is same as hole punch and should free all pages. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + + /* + * Test MADV_REMOVE on shared and private mapping of hugetlb file + */ + if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { + perror("fallocate"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* shared write should not consume any additional pages */ + write_fault_pages(addr, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (addr2 == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + /* private read should not consume any pages */ + read_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages */ + write_fault_pages(addr2, NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of shared mapping should not free any pages */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* madvise of private mapping should free private pages */ + if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages - NR_HUGE_PAGES); + + /* private write should consume additional pages again */ + write_fault_pages(addr2, 
NR_HUGE_PAGES); + validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); + + /* + * madvise should free both file and private pages although this is + * not correct. private pages should not be freed, but this is + * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. + */ + if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { + perror("madvise"); + exit(1); + } + validate_free_pages(free_hugepages); + + (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); + (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); + + close(fd); + return 0; +} --- /dev/null +++ a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh @@ -0,0 +1,252 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +set -e + +if [[ $(id -u) -ne 0 ]]; then + echo "This test must be run as root. Skipping..." + exit $ksft_skip +fi + +usage_file=usage_in_bytes + +if [[ "$1" == "-cgroup-v2" ]]; then + cgroup2=1 + usage_file=current +fi + + +if [[ $cgroup2 ]]; then + CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory + mount -t cgroup2 none $CGROUP_ROOT + do_umount=1 + fi + echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control +else + CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory + mount -t cgroup memory,hugetlb $CGROUP_ROOT + do_umount=1 + fi +fi +MNT='/mnt/huge/' + +function get_machine_hugepage_size() { + hpz=$(grep -i hugepagesize /proc/meminfo) + kb=${hpz:14:-3} + mb=$(($kb / 1024)) + echo $mb +} + +MB=$(get_machine_hugepage_size) + +function cleanup() { + echo cleanup + set +e + rm -rf "$MNT"/* 2>/dev/null + umount "$MNT" 2>/dev/null + rmdir "$MNT" 2>/dev/null + rmdir "$CGROUP_ROOT"/a/b 2>/dev/null + rmdir "$CGROUP_ROOT"/a 2>/dev/null + rmdir "$CGROUP_ROOT"/test1 2>/dev/null + echo 0 >/proc/sys/vm/nr_hugepages + set -e +} + +function assert_state() { + local expected_a="$1" + local expected_a_hugetlb="$2" + local expected_b="" + local expected_b_hugetlb="" + + if [ ! -z ${3:-} ] && [ ! 
-z ${4:-} ]; then + expected_b="$3" + expected_b_hugetlb="$4" + fi + local tolerance=$((5 * 1024 * 1024)) + + local actual_a + actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" + if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || + [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then + echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB + echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + local actual_a_hugetlb + actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" + if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || + [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then + echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB + echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then + return + fi + + local actual_b + actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" + if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || + [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then + echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB + echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi + + local actual_b_hugetlb + actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" + if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || + [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then + echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB + echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB + echo fail + + cleanup + exit 1 + fi +} + +function setup() { + echo 100 >/proc/sys/vm/nr_hugepages + mkdir "$CGROUP_ROOT"/a + sleep 1 + if [[ $cgroup2 ]]; then + echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control + else + echo 0 >$CGROUP_ROOT/a/cpuset.mems + echo 0 >$CGROUP_ROOT/a/cpuset.cpus + fi + + mkdir "$CGROUP_ROOT"/a/b + + if [[ ! $cgroup2 ]]; then + echo 0 >$CGROUP_ROOT/a/b/cpuset.mems + echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus + fi + + mkdir -p "$MNT" + mount -t hugetlbfs none "$MNT" +} + +write_hugetlbfs() { + local cgroup="$1" + local path="$2" + local size="$3" + + if [[ $cgroup2 ]]; then + echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs + else + echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems + echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus + echo $$ >"$CGROUP_ROOT/$cgroup/tasks" + fi + ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o + if [[ $cgroup2 ]]; then + echo $$ >$CGROUP_ROOT/cgroup.procs + else + echo $$ >"$CGROUP_ROOT/tasks" + fi + echo +} + +set -e + +size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. + +cleanup + +echo +echo +echo Test charge, rmdir, uncharge +setup +echo mkdir +mkdir $CGROUP_ROOT/test1 + +echo write +write_hugetlbfs test1 "$MNT"/test $size + +echo rmdir +rmdir $CGROUP_ROOT/test1 +mkdir $CGROUP_ROOT/test1 + +echo uncharge +rm -rf /mnt/huge/* + +cleanup + +echo done +echo +echo +if [[ ! $cgroup2 ]]; then + echo "Test parent and child hugetlb usage" + setup + + echo write + write_hugetlbfs a "$MNT"/test $size + + echo Assert memory charged correctly for parent use. + assert_state 0 $size 0 0 + + write_hugetlbfs a/b "$MNT"/test2 $size + + echo Assert memory charged correctly for child use. + assert_state 0 $(($size * 2)) 0 $size + + rmdir "$CGROUP_ROOT"/a/b + sleep 5 + echo Assert memory reparent correctly. 
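+  # Reparenting (illustrative note): removing a/b moves its hugetlb
+  # charge up to its parent a, so a should still show 2 * size below.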
+ assert_state 0 $(($size * 2)) + + rm -rf "$MNT"/* + umount "$MNT" + echo Assert memory uncharged correctly. + assert_state 0 0 + + cleanup +fi + +echo +echo +echo "Test child only hugetlb usage" +echo setup +setup + +echo write +write_hugetlbfs a/b "$MNT"/test2 $size + +echo Assert memory charged correctly for child only use. +assert_state 0 $(($size)) 0 $size + +rmdir "$CGROUP_ROOT"/a/b +echo Assert memory reparent correctly. +assert_state 0 $size + +rm -rf "$MNT"/* +umount "$MNT" +echo Assert memory uncharged correctly. +assert_state 0 0 + +cleanup + +echo ALL PASS + +umount $CGROUP_ROOT +rm -rf $CGROUP_ROOT --- /dev/null +++ a/tools/testing/selftests/mm/khugepaged.c @@ -0,0 +1,1558 @@ +#define _GNU_SOURCE +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <limits.h> +#include <dirent.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> + +#include <sys/mman.h> +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/sysmacros.h> +#include <sys/vfs.h> + +#include "linux/magic.h" + +#include "vm_util.h" + +#ifndef MADV_PAGEOUT +#define MADV_PAGEOUT 21 +#endif +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif + +#define BASE_ADDR ((void *)(1UL << 30)) +static unsigned long hpage_pmd_size; +static unsigned long page_size; +static int hpage_pmd_nr; + +#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" +#define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; + +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); + const char *name; +}; + +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; + +struct collapse_context { + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); + bool enforce_pte_scan_limits; + const char *name; +}; + +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; +}; + +static struct file_info finfo; + +enum thp_enabled { + THP_ALWAYS, + THP_MADVISE, + THP_NEVER, +}; + +static const char *thp_enabled_strings[] = { + "always", + "madvise", + "never", + NULL +}; + +enum thp_defrag { + THP_DEFRAG_ALWAYS, + THP_DEFRAG_DEFER, + THP_DEFRAG_DEFER_MADVISE, + THP_DEFRAG_MADVISE, + THP_DEFRAG_NEVER, +}; + +static const char *thp_defrag_strings[] = { + "always", + "defer", + "defer+madvise", + "madvise", + "never", + NULL +}; + +enum shmem_enabled { + SHMEM_ALWAYS, + SHMEM_WITHIN_SIZE, + SHMEM_ADVISE, + SHMEM_NEVER, + SHMEM_DENY, + SHMEM_FORCE, +}; + +static const char *shmem_enabled_strings[] = { + "always", + "within_size", + "advise", + "never", + "deny", + "force", + NULL +}; + +struct khugepaged_settings { + bool defrag; + unsigned int alloc_sleep_millisecs; + unsigned int scan_sleep_millisecs; + unsigned int max_ptes_none; + unsigned int max_ptes_swap; + unsigned int max_ptes_shared; + unsigned long pages_to_scan; +}; + +struct settings { + enum thp_enabled thp_enabled; + enum thp_defrag thp_defrag; + enum shmem_enabled 
shmem_enabled; + bool use_zero_page; + struct khugepaged_settings khugepaged; + unsigned long read_ahead_kb; +}; + +static struct settings saved_settings; +static bool skip_settings_restore; + +static int exit_status; + +static void success(const char *msg) +{ + printf(" \e[32m%s\e[0m\n", msg); +} + +static void fail(const char *msg) +{ + printf(" \e[31m%s\e[0m\n", msg); + exit_status++; +} + +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + +static int read_file(const char *path, char *buf, size_t buflen) +{ + int fd; + ssize_t numread; + + fd = open(path, O_RDONLY); + if (fd == -1) + return 0; + + numread = read(fd, buf, buflen - 1); + if (numread < 1) { + close(fd); + return 0; + } + + buf[numread] = '\0'; + close(fd); + + return (unsigned int) numread; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); + return 0; + } + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); + return 0; + } + + return (unsigned int) numwritten; +} + +static int read_string(const char *name, const char *strings[]) +{ + char path[PATH_MAX]; + char buf[256]; + char *c; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + + c = strchr(buf, '['); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + + c++; + memmove(buf, c, sizeof(buf) - (c - buf)); + + c = strchr(buf, ']'); + if (!c) { + printf("%s: Parse failure\n", __func__); + exit(EXIT_FAILURE); + } + *c = '\0'; + + ret = 0; + while (strings[ret]) { + if (!strcmp(strings[ret], buf)) + return ret; + ret++; + } + + printf("Failed to parse %s\n", name); + exit(EXIT_FAILURE); +} + +static void write_string(const char *name, const char *val) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(path, val, strlen(val) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static const unsigned long _read_num(const char *path) +{ + char buf[21]; + + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + + return strtoul(buf, NULL, 10); +} + +static const unsigned long read_num(const char *name) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return _read_num(path); +} + +static void _write_num(const char *path, unsigned long num) +{ + char buf[21]; + + sprintf(buf, "%ld", num); + if (!write_file(path, buf, strlen(buf) + 1)) { + perror(path); + exit(EXIT_FAILURE); + } +} + +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + _write_num(path, num); +} + +static void write_settings(struct settings *settings) +{ + struct khugepaged_settings *khugepaged = &settings->khugepaged; + + write_string("enabled", 
thp_enabled_strings[settings->thp_enabled]); + write_string("defrag", thp_defrag_strings[settings->thp_defrag]); + write_string("shmem_enabled", + shmem_enabled_strings[settings->shmem_enabled]); + write_num("use_zero_page", settings->use_zero_page); + + write_num("khugepaged/defrag", khugepaged->defrag); + write_num("khugepaged/alloc_sleep_millisecs", + khugepaged->alloc_sleep_millisecs); + write_num("khugepaged/scan_sleep_millisecs", + khugepaged->scan_sleep_millisecs); + write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); + write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); + write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); + write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (file_ops && finfo.type == VMA_FILE) + _write_num(finfo.dev_queue_read_ahead_path, + settings->read_ahead_kb); +} + +#define MAX_SETTINGS_DEPTH 4 +static struct settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; + +static struct settings *current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +static void push_settings(struct settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + write_settings(current_settings()); +} + +static void pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + write_settings(current_settings()); +} + +static void restore_settings(int sig) +{ + if (skip_settings_restore) + goto out; + + printf("Restore THP and khugepaged settings..."); + write_settings(&saved_settings); + success("OK"); + if (sig) + exit(EXIT_FAILURE); +out: + exit(exit_status); +} + +static void save_settings(void) +{ + printf("Save THP and khugepaged settings..."); + saved_settings = (struct settings) { + .thp_enabled = read_string("enabled", thp_enabled_strings), + .thp_defrag = read_string("defrag", thp_defrag_strings), + .shmem_enabled = + read_string("shmem_enabled", shmem_enabled_strings), + .use_zero_page = read_num("use_zero_page"), + }; + saved_settings.khugepaged = (struct khugepaged_settings) { + .defrag = read_num("khugepaged/defrag"), + .alloc_sleep_millisecs = + read_num("khugepaged/alloc_sleep_millisecs"), + .scan_sleep_millisecs = + read_num("khugepaged/scan_sleep_millisecs"), + .max_ptes_none = read_num("khugepaged/max_ptes_none"), + .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), + .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), + .pages_to_scan = read_num("khugepaged/pages_to_scan"), + }; + if (file_ops && finfo.type == VMA_FILE) + saved_settings.read_ahead_kb = + _read_num(finfo.dev_queue_read_ahead_path); + + success("OK"); + + signal(SIGTERM, restore_settings); + signal(SIGINT, restore_settings); + signal(SIGHUP, restore_settings); + signal(SIGQUIT, restore_settings); +} + +static void get_finfo(const char *dir) +{ + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; + + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory (%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); + } + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + 
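+	/* Classify the backing filesystem: tmpfs means shmem, else file. */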
if (statfs(finfo.dir, &fs)) { + perror("statfs()"); + exit(EXIT_FAILURE); + } + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (!read_file(path, buf, sizeof(buf))) { + perror(path); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); + exit(EXIT_FAILURE); + } + /* + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. + */ + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); +} + +static bool check_swap(void *addr, unsigned long size) +{ + bool swap = false; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + + fp = fopen(PID_SMAPS, "r"); + if (!fp) { + printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); + exit(EXIT_FAILURE); + } + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) + goto err_out; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", + size >> 10); + if (ret >= MAX_LINE_LENGTH) { + printf("%s: Pattern is too long\n", __func__); + exit(EXIT_FAILURE); + } + /* + * Fetch the Swap: in the same block and check whether it got + * the expected number of hugepages next. + */ + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) + goto err_out; + + if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) + goto err_out; + + swap = true; +err_out: + fclose(fp); + return swap; +} + +static void *alloc_mapping(int nr) +{ + void *p; + + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (p != BASE_ADDR) { + printf("Failed to allocate VMA at %p\n", BASE_ADDR); + exit(EXIT_FAILURE); + } + + return p; +} + +static void fill_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) + p[i * page_size / sizeof(*p)] = i + 0xdead0000; +} + +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. 
In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(struct mem_ops *ops) +{ + void *p = ops->setup_area(1); + + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. + * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. + */ + printf("Allocate huge page..."); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); + return p; +} + +static void validate_memory(int *p, unsigned long start, unsigned long end) +{ + int i; + + for (i = start / page_size; i < end / page_size; i++) { + if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { + printf("Page %d is corrupted: %#x\n", + i, p[i * page_size / sizeof(*p)]); + exit(EXIT_FAILURE); + } + } +} + +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? 
" (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE); + } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops __anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = &anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", +}; + +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + int ret; + struct settings settings = *current_settings(); + + printf("%s...", msg); + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; + push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (!ops->check_huge(p, expect ? 
nr_hpages : 0)) + fail("Fail: check_huge()"); + else + success("OK"); + + pop_settings(); +} + +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + +#define TICK 500000 +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) +{ + int full_scans; + int timeout = 6; /* 3 seconds */ + + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + + /* Wait until the second full_scan completed */ + full_scans = read_num("khugepaged/full_scans") + 2; + + printf("%s...", msg); + while (timeout--) { + if (ops->check_huge(p, nr_hpages)) + break; + if (read_num("khugepaged/full_scans") >= full_scans) + break; + printf("."); + usleep(TICK); + } + + madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); + + return timeout == -1; +} + +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + if (wait_for_scan(msg, p, nr_hpages, ops)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } + + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + + if (ops->check_huge(p, expect ? nr_hpages : 0)) + success("OK"); + else + fail("Fail"); +} + +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + +static void alloc_at_fault(void) +{ + struct settings settings = *current_settings(); + char *p; + + settings.thp_enabled = THP_ALWAYS; + push_settings(&settings); + + p = alloc_mapping(1); + *p = 1; + printf("Allocate huge page on fault..."); + if (check_huge_anon(p, 1, hpage_pmd_size)) + success("OK"); + else + fail("Fail"); + + pop_settings(); + + madvise(p, page_size, MADV_DONTNEED); + printf("Split huge PMD on MADV_DONTNEED..."); + if (check_huge_anon(p, 0, hpage_pmd_size)) + success("OK"); + else + fail("Fail"); + munmap(p, hpage_pmd_size); +} + +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + ops, true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, page_size); + c->collapse("Collapse PTE table with single PTE entry present", p, + 
1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_none = hpage_pmd_nr / 2; + struct settings settings = *current_settings(); + void *p; + + settings.khugepaged.max_ptes_none = max_ptes_none; + push_settings(&settings); + + p = ops->setup_area(1); + + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, + ops, !c->enforce_pte_scan_limits); + validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, + true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } +skip: + ops->cleanup_area(p, hpage_pmd_size); + pop_settings(); +} + +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + + printf("Swapout one page..."); + if (madvise(p, page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); + validate_memory(p, 0, hpage_pmd_size); +out: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + + printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); + if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, (max_ptes_swap + 1) * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, + !c->enforce_pte_scan_limits); + validate_memory(p, 0, hpage_pmd_size); + + if (c->enforce_pte_scan_limits) { + ops->fault(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } + + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + } +out: + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = alloc_hpage(ops); + + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + printf("Split huge page leaving single PTE mapping compound page..."); + madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, 1, ops, true); + validate_memory(p, 0, page_size); +skip: + 
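/* Unmap the area whether the sub-test ran or was skipped. */ +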
ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + + p = alloc_hpage(ops); + printf("Split huge page leaving single PTE page table full of compound pages..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) +{ + void *p; + int i; + + p = ops->setup_area(1); + for (i = 0; i < hpage_pmd_nr; i++) { + printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", + i + 1, hpage_pmd_nr); + + madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { + printf("Failed to allocate huge page\n"); + exit(EXIT_FAILURE); + } + madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); + + p = mremap(BASE_ADDR - i * page_size, + i * page_size + hpage_pmd_size, + (i + 1) * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR + 2 * hpage_pmd_size); + if (p == MAP_FAILED) { + perror("mremap+unmap"); + exit(EXIT_FAILURE); + } + + p = mremap(BASE_ADDR + 2 * hpage_pmd_size, + (i + 1) * page_size, + (i + 1) * page_size + hpage_pmd_size, + MREMAP_MAYMOVE | MREMAP_FIXED, + BASE_ADDR - (i + 1) * page_size); + if (p == MAP_FAILED) { + perror("mremap+alloc"); + exit(EXIT_FAILURE); + } + } + + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse PTE table full of different compound pages", p, 1, + ops, true); + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) +{ + int wstatus; + void *p; + + p = ops->setup_area(1); + + printf("Allocate small page..."); + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + printf("Share small page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + ops->fault(p, page_size, 2 * page_size); + c->collapse("Collapse PTE table with single page shared with parent process", + p, 1, ops, true); + + validate_memory(p, 0, page_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has small page..."); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, page_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) +{ + int wstatus; + void *p; + + p = alloc_hpage(ops); + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + printf("Split huge page PMD in child process..."); + madvise(p, page_size, MADV_NOHUGEPAGE); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + if (ops->check_huge(p, 0)) + success("OK"); + 
else + fail("Fail"); + ops->fault(p, 0, page_size); + + write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); + c->collapse("Collapse PTE table full of compound pages in child", + p, 1, ops, true); + write_num("khugepaged/max_ptes_shared", + current_settings()->khugepaged.max_ptes_shared); + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) +{ + int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); + int wstatus; + void *p; + + p = alloc_hpage(ops); + printf("Share huge page over fork()..."); + if (!fork()) { + /* Do not touch settings on child exit */ + skip_settings_restore = true; + exit_status = 0; + + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + 1, ops, !c->enforce_pte_scan_limits); + + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (ops->check_huge(p, 0)) + success("OK"); + else + fail("Fail"); + + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, 1, ops, true); + } + + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); + exit(exit_status); + } + + wait(&wstatus); + exit_status += WEXITSTATUS(wstatus); + + printf("Check if parent still has huge page..."); + if (ops->check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + + /* c->collapse() will find a hugepage and complain - call directly. */ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); +} + +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. 
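+ * MADV_COLLAPSE should then install a huge pmd for the hugepage already in the page cache.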
+ */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n"); + fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n"); + fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + exit(1); +} + +static void parse_test_type(int argc, const char **argv) +{ + char *buf; + const char *token; + + if (argc == 1) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[1]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 3) + usage(); +} + +int main(int argc, const char **argv) +{ + struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_ADVISE, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. 
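+ * The saved read_ahead_kb value is restored on exit together with the THP and khugepaged settings.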
+ */ + .read_ahead_kb = 0, + }; + + parse_test_type(argc, argv); + + if (file_ops) + get_finfo(argv[2]); + + setbuf(stdout, NULL); + + page_size = getpagesize(); + hpage_pmd_size = read_pmd_pagesize(); + hpage_pmd_nr = hpage_pmd_size / page_size; + + default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; + default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; + default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; + default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; + + save_settings(); + push_settings(&default_settings); + + alloc_at_fault(); + +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) + + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); + + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); + + TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); + + 
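/* Restore the previously saved settings and exit with the accumulated exit_status. */ +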
restore_settings(0); +} --- /dev/null +++ a/tools/testing/selftests/mm/ksm_functional_tests.c @@ -0,0 +1,374 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KSM functional tests + * + * Copyright 2022, Red Hat, Inc. + * + * Author(s): David Hildenbrand <david@xxxxxxxxxx> + */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <linux/userfaultfd.h> + +#include "../kselftest.h" +#include "vm_util.h" + +#define KiB 1024u +#define MiB (1024 * KiB) + +static int ksm_fd; +static int ksm_full_scans_fd; +static int ksm_zero_pages_fd; +static int ksm_use_zero_pages_fd; +static int pagemap_fd; +static size_t pagesize; + +static bool range_maps_duplicates(char *addr, unsigned long size) +{ + unsigned long offs_a, offs_b, pfn_a, pfn_b; + + /* + * There is no easy way to check if there are KSM pages mapped into + * this range. We only check that the range does not map the same PFN + * twice by comparing each pair of mapped pages. + */ + for (offs_a = 0; offs_a < size; offs_a += pagesize) { + pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); + /* Page not present or PFN not exposed by the kernel. */ + if (pfn_a == -1ul || !pfn_a) + continue; + + for (offs_b = offs_a + pagesize; offs_b < size; + offs_b += pagesize) { + pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b); + if (pfn_b == -1ul || !pfn_b) + continue; + if (pfn_a == pfn_b) + return true; + } + } + return false; +} + +static bool check_ksm_zero_pages_count(unsigned long zero_size) +{ + unsigned long pages_expected = zero_size / (4 * KiB); + char buf[20]; + ssize_t read_size; + unsigned long ksm_zero_pages; + + read_size = pread(ksm_zero_pages_fd, buf, sizeof(buf) - 1, 0); + if (read_size < 0) + return -errno; + buf[read_size] = 0; + ksm_zero_pages = strtol(buf, NULL, 10); + + return ksm_zero_pages == pages_expected; +} + +static long ksm_get_full_scans(void) +{ + char buf[10]; + ssize_t ret; + + ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); + if (ret <= 0) + return -errno; + buf[ret] = 0; + + return strtol(buf, NULL, 10); +} + +static int wait_two_full_scans(void) +{ + long start_scans, end_scans; + + start_scans = ksm_get_full_scans(); + if (start_scans < 0) + return -errno; + do { + end_scans = ksm_get_full_scans(); + if (end_scans < 0) + return end_scans; + } while (end_scans < start_scans + 2); + + return 0; +} + +static inline int ksm_merge(void) +{ + /* Wait for two full scans such that any possible merging happened. */ + if (write(ksm_fd, "1", 1) != 1) + return -errno; + return wait_two_full_scans(); +} + +static inline int make_cow(char *map, char val, unsigned long size) +{ + + memset(map, val, size); + return wait_two_full_scans(); +} + +static int unmerge_zero_page(char *start, unsigned long size) +{ + int ret; + + ret = madvise(start, size, MADV_UNMERGEABLE); + if (ret) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + return ret; + } + + return wait_two_full_scans(); +} + +static char *mmap_and_merge_range(char val, unsigned long size) +{ + char *map; + + map = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANON, -1, 0); + if (map == MAP_FAILED) { + ksft_test_result_fail("mmap() failed\n"); + return MAP_FAILED; + } + + /* Don't use THP. Ignore if THP are not around on a kernel. 
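+ * (madvise(MADV_NOHUGEPAGE) fails with EINVAL in that case, which is tolerated below.)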
*/ + if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { + ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); + goto unmap; + } + + /* Make sure each page contains the same values to merge them. */ + memset(map, val, size); + if (madvise(map, size, MADV_MERGEABLE)) { + ksft_test_result_fail("MADV_MERGEABLE failed\n"); + goto unmap; + } + + /* Run KSM to trigger merging and wait. */ + if (ksm_merge()) { + ksft_test_result_fail("Running KSM failed\n"); + goto unmap; + } + return map; +unmap: + munmap(map, size); + return MAP_FAILED; +} + +static void test_unmerge(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +static void test_unmerge_zero_pages(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + /* Confirm that the zero-page interfaces are available. */ + ksm_zero_pages_fd = open("/sys/kernel/mm/ksm/zero_pages_sharing", O_RDONLY); + if (ksm_zero_pages_fd < 0) { + ksft_test_result_skip("open(\"/sys/kernel/mm/ksm/zero_pages_sharing\") failed\n"); + return; + } + ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); + if (ksm_use_zero_pages_fd < 0) { + ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { + ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); + return; + } + + /* Map zero-filled pages and let KSM merge them with the shared zeropage. */ + map = mmap_and_merge_range(0x00, size); + if (map == MAP_FAILED) + return; + + /* Case 1: write to the merged zero pages, triggering CoW. */ + if (make_cow(map, 0xcf, size / 2)) { + ksft_test_result_fail("COW failed\n"); + goto unmap; + } + ksft_test_result(check_ksm_zero_pages_count(size / 2), + "zero page count reacts to CoW\n"); + + /* Case 2: unmerge part of the remaining zero pages via MADV_UNMERGEABLE. */ + if (unmerge_zero_page(map + size / 2, size / 4)) { + ksft_test_result_fail("unmerge_zero_page failed\n"); + goto unmap; + } + ksft_test_result(check_ksm_zero_pages_count(size / 4), + "zero page count reacts to unmerge\n"); + + /* Check that the KSM zero pages were really unmerged. */ + ksft_test_result(!range_maps_duplicates(map + size / 2, size / 4), + "KSM zero pages were unmerged\n"); + +unmap: + munmap(map, size); +} + +static void test_unmerge_discarded(void) +{ + const unsigned int size = 2 * MiB; + char *map; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + /* Discard half of all mapped pages so we have pte_none() entries. 
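+ * MADV_UNMERGEABLE has to deal with these holes when unmerging the rest.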
*/ + if (madvise(map, size / 2, MADV_DONTNEED)) { + ksft_test_result_fail("MADV_DONTNEED failed\n"); + goto unmap; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto unmap; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +unmap: + munmap(map, size); +} + +#ifdef __NR_userfaultfd +static void test_unmerge_uffd_wp(void) +{ + struct uffdio_writeprotect uffd_writeprotect; + struct uffdio_register uffdio_register; + const unsigned int size = 2 * MiB; + struct uffdio_api uffdio_api; + char *map; + int uffd; + + ksft_print_msg("[RUN] %s\n", __func__); + + map = mmap_and_merge_range(0xcf, size); + if (map == MAP_FAILED) + return; + + /* See if UFFD is around. */ + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd < 0) { + ksft_test_result_skip("__NR_userfaultfd failed\n"); + goto unmap; + } + + /* See if UFFD-WP is around. */ + uffdio_api.api = UFFD_API; + uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { + ksft_test_result_fail("UFFDIO_API failed\n"); + goto close_uffd; + } + if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { + ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n"); + goto close_uffd; + } + + /* Register UFFD-WP, no need for an actual handler. */ + uffdio_register.range.start = (unsigned long) map; + uffdio_register.range.len = size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { + ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n"); + goto close_uffd; + } + + /* Write-protect the range using UFFD-WP. */ + uffd_writeprotect.range.start = (unsigned long) map; + uffd_writeprotect.range.len = size; + uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; + if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { + ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n"); + goto close_uffd; + } + + if (madvise(map, size, MADV_UNMERGEABLE)) { + ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); + goto close_uffd; + } + + ksft_test_result(!range_maps_duplicates(map, size), + "Pages were unmerged\n"); +close_uffd: + close(uffd); +unmap: + munmap(map, size); +} +#endif + +int main(int argc, char **argv) +{ + unsigned int tests = 2; + int err; + +#ifdef __NR_userfaultfd + tests++; +#endif + + ksft_print_header(); + ksft_set_plan(tests); + + pagesize = getpagesize(); + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); + ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); + if (ksm_full_scans_fd < 0) + ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); + + test_unmerge(); + test_unmerge_zero_pages(); + test_unmerge_discarded(); +#ifdef __NR_userfaultfd + test_unmerge_uffd_wp(); +#endif + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} --- /dev/null +++ a/tools/testing/selftests/mm/ksm_tests.c @@ -0,0 +1,849 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <sys/mman.h> +#include <stdbool.h> +#include <time.h> +#include <string.h> +#include <numa.h> +#include <unistd.h> +#include <fcntl.h> +#include <stdint.h> +#include <err.h> + +#include "../kselftest.h" +#include 
<include/vdso/time64.h> +#include "util.h" + +#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" +#define KSM_FP(s) (KSM_SYSFS_PATH s) +#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 +#define KSM_PAGE_COUNT_DEFAULT 10l +#define KSM_PROT_STR_DEFAULT "rw" +#define KSM_USE_ZERO_PAGES_DEFAULT false +#define KSM_MERGE_ACROSS_NODES_DEFAULT true +#define MB (1ul << 20) + +struct ksm_sysfs { + unsigned long max_page_sharing; + unsigned long merge_across_nodes; + unsigned long pages_to_scan; + unsigned long run; + unsigned long sleep_millisecs; + unsigned long stable_node_chains_prune_millisecs; + unsigned long use_zero_pages; +}; + +enum ksm_test_name { + CHECK_KSM_MERGE, + CHECK_KSM_UNMERGE, + CHECK_KSM_ZERO_PAGE_MERGE, + CHECK_KSM_NUMA_MERGE, + KSM_MERGE_TIME, + KSM_MERGE_TIME_HUGE_PAGES, + KSM_UNMERGE_TIME, + KSM_COW_TIME +}; + +static int ksm_write_sysfs(const char *file_path, unsigned long val) +{ + FILE *f = fopen(file_path, "w"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fprintf(f, "%lu", val) < 0) { + perror("fprintf"); + fclose(f); + return 1; + } + fclose(f); + + return 0; +} + +static int ksm_read_sysfs(const char *file_path, unsigned long *val) +{ + FILE *f = fopen(file_path, "r"); + + if (!f) { + fprintf(stderr, "f %s\n", file_path); + perror("fopen"); + return 1; + } + if (fscanf(f, "%lu", val) != 1) { + perror("fscanf"); + fclose(f); + return 1; + } + fclose(f); + + return 0; +} + +static int str_to_prot(char *prot_str) +{ + int prot = 0; + + if ((strchr(prot_str, 'r')) != NULL) + prot |= PROT_READ; + if ((strchr(prot_str, 'w')) != NULL) + prot |= PROT_WRITE; + if ((strchr(prot_str, 'x')) != NULL) + prot |= PROT_EXEC; + + return prot; +} + +static void print_help(void) +{ + printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n" + "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); + + printf("Supported <test type>:\n" + " -M (page merging)\n" + " -Z (zero pages merging)\n" + " -N (merging of pages in different NUMA nodes)\n" + " -U (page unmerging)\n" + " -P evaluate merging time and speed.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -H evaluate merging time and speed of area allocated mostly with huge pages\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -D evaluate unmerging time and speed when disabling KSM.\n" + " For this test, the size of duplicated memory area (in MiB)\n" + " must be provided using -s option\n" + " -C evaluate the time required to break COW of merged pages.\n\n"); + + printf(" -a: specify the access protections of pages.\n" + " <prot> must be of the form [rwx].\n" + " Default: %s\n", KSM_PROT_STR_DEFAULT); + printf(" -p: specify the number of pages to test.\n" + " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); + printf(" -l: limit the maximum running time (in seconds) for a test.\n" + " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); + printf(" -z: change use_zero_pages tunable\n" + " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); + printf(" -m: change merge_across_nodes tunable\n" + " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); + printf(" -s: the size of duplicated memory area (in MiB)\n"); + + exit(0); +} + +static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) +{ + void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); + + if (!map_ptr) { + perror("mmap"); + return NULL; + } + memset(map_ptr, data, 
map_size); + if (mprotect(map_ptr, map_size, prot)) { + perror("mprotect"); + munmap(map_ptr, map_size); + return NULL; + } + + return map_ptr; +} + +static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) +{ + struct timespec cur_time; + unsigned long cur_scan, init_scan; + + if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) + return 1; + cur_scan = init_scan; + + while (cur_scan < init_scan + scan_count) { + if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) + return 1; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { + perror("clock_gettime"); + return 1; + } + if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { + printf("Scan time limit exceeded\n"); + return 1; + } + } + + return 0; +} + +static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_MERGEABLE)) { + perror("madvise"); + return 1; + } + if (ksm_write_sysfs(KSM_FP("run"), 1)) + return 1; + + /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ + if (ksm_do_scan(2, start_time, timeout)) + return 1; + + return 0; +} + +static int ksm_unmerge_pages(void *addr, size_t size, + struct timespec start_time, int timeout) +{ + if (madvise(addr, size, MADV_UNMERGEABLE)) { + perror("madvise"); + return 1; + } + return 0; +} + +static bool assert_ksm_pages_count(long dupl_page_count) +{ + unsigned long max_page_sharing, pages_sharing, pages_shared; + + if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || + ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || + ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) + return false; + + /* + * Since there must be at least 2 pages for merging and 1 page can be + * shared with the limited number of pages (max_page_sharing), sometimes + * there are 'leftover' pages that cannot be merged. For example, if there + * are 11 pages and max_page_sharing = 10, then only 10 pages will be + * merged and the 11th page won't be affected. As a result, when the number + * of duplicate pages is divided by max_page_sharing and the remainder is 1, + * pages_shared and pages_sharing values will be equal between dupl_page_count + * and dupl_page_count - 1. + */ + if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { + if (pages_shared == dupl_page_count / max_page_sharing && + pages_sharing == pages_shared * (max_page_sharing - 1)) + return true; + } else { + if (pages_shared == (dupl_page_count / max_page_sharing + 1) && + pages_sharing == dupl_page_count - pages_shared) + return true; + } + + return false; +} + +static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || + numa_available() ? 0 : + ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || + ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || + ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || + ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || + ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + &ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int ksm_restore(struct ksm_sysfs *ksm_sysfs) +{ + if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || + numa_available() ? 
0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || + ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || + ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), + ksm_sysfs->stable_node_chains_prune_millisecs) || + ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) + return 1; + + return 0; +} + +static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* verify that the right number of pages are merged */ + if (assert_ksm_pages_count(page_count)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + int page_count = 2; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + /* fill pages with the same data and merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ + memset(map_ptr, '-', 1); + memset(map_ptr + page_size, '+', 1); + + /* get at least 1 scan, so KSM can detect that the pages were modified */ + if (ksm_do_scan(1, start_time, timeout)) + goto err_out; + + /* check that unmerging was successful and 0 pages are currently merged */ + if (assert_ksm_pages_count(0)) { + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + } + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, + bool use_zero_pages, size_t page_size) +{ + void *map_ptr; + struct timespec start_time; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) + return KSFT_FAIL; + + /* fill pages with zero and try to merge them */ + map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if use_zero_pages is set to 1, empty pages are merged + * with the kernel zero page instead of with each other; + * 2) if use_zero_pages is set to 0, empty pages are not treated specially + * and merged as usual. 
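+ * The checks below therefore expect 0 or page_count merged pages, respectively.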
+ */ + if (use_zero_pages && !assert_ksm_pages_count(0)) + goto err_out; + else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) + goto err_out; + + printf("OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +static int get_next_mem_node(int node) +{ + + long node_size; + int mem_node = 0; + int i, max_node = numa_max_node(); + + for (i = node + 1; i <= max_node + node; i++) { + mem_node = i % (max_node + 1); + node_size = numa_node_size(mem_node, NULL); + if (node_size > 0) + break; + } + return mem_node; +} + +static int get_first_mem_node(void) +{ + return get_next_mem_node(numa_max_node()); +} + +static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, + size_t page_size) +{ + void *numa1_map_ptr, *numa2_map_ptr; + struct timespec start_time; + int page_count = 2; + int first_node; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + if (numa_available() < 0) { + perror("NUMA support not enabled"); + return KSFT_SKIP; + } + if (numa_num_configured_nodes() <= 1) { + printf("At least 2 NUMA nodes must be available\n"); + return KSFT_SKIP; + } + if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) + return KSFT_FAIL; + + /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ + first_node = get_first_mem_node(); + numa1_map_ptr = numa_alloc_onnode(page_size, first_node); + numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); + if (!numa1_map_ptr || !numa2_map_ptr) { + perror("numa_alloc_onnode"); + return KSFT_FAIL; + } + + memset(numa1_map_ptr, '*', page_size); + memset(numa2_map_ptr, '*', page_size); + + /* try to merge the pages */ + if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || + ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) + goto err_out; + + /* + * verify that the right number of pages are merged: + * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; + * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is + * only 1 unique page in each node and they can't be shared. 
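+ * (With merge_across_nodes disabled, KSM keeps a separate stable tree per NUMA node.)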
+ */ + if (merge_across_nodes && !assert_ksm_pages_count(page_count)) + goto err_out; + else if (!merge_across_nodes && !assert_ksm_pages_count(0)) + goto err_out; + + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("OK\n"); + return KSFT_PASS; + +err_out: + numa_free(numa1_map_ptr, page_size); + numa_free(numa2_map_ptr, page_size); + printf("Not OK\n"); + return KSFT_FAIL; +} + +static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr, *map_ptr_orig; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + int pagemap_fd, n_normal_pages, n_huge_pages; + + map_size *= MB; + size_t len = map_size; + + len -= len % HPAGE_SIZE; + map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); + map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE; + + if (map_ptr_orig == MAP_FAILED) + err(2, "initial mmap"); + + if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + n_normal_pages = 0; + n_huge_pages = 0; + for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) { + if (allocate_transhuge(p, pagemap_fd) < 0) + n_normal_pages++; + else + n_huge_pages++; + } + printf("Number of normal pages: %d\n", n_normal_pages); + printf("Number of huge pages: %d\n", n_huge_pages); + + memset(map_ptr, '*', len); + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr_orig, len + HPAGE_SIZE); + return KSFT_FAIL; +} + +static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) +{ + void *map_ptr; + struct 
timespec start_time, end_time; + unsigned long scan_time_ns; + + map_size *= MB; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); + if (!map_ptr) + return KSFT_FAIL; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) + goto err_out; + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n", map_size / MB); + printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, + scan_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", (map_size / MB) / + ((double)scan_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, map_size); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, map_size); + return KSFT_FAIL; +} + +static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) +{ + void *map_ptr; + struct timespec start_time, end_time; + unsigned long cow_time_ns; + + /* page_count must be less than 2*page_size */ + size_t page_count = 4000; + + map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); + if (!map_ptr) + return KSFT_FAIL; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + return KSFT_FAIL; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); + printf("Not merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + /* Create 2000 pairs of duplicate pages */ + for (size_t i = 0; i < page_count - 1; i = i + 2) { + memset(map_ptr + page_size * i, '+', i / 2 + 1); + memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); + } + if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) + goto err_out; + + if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { + perror("clock_gettime"); + goto err_out; + } + for (size_t i = 0; i < page_count - 1; i = i + 2) + memset(map_ptr + page_size * i, '-', 1); + if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { + perror("clock_gettime"); + goto err_out; + } + + cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + + (end_time.tv_nsec - start_time.tv_nsec); + + printf("Merged pages:\n"); + printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, + cow_time_ns % NSEC_PER_SEC); + printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / + ((double)cow_time_ns / NSEC_PER_SEC)); + + munmap(map_ptr, page_size * page_count); + return KSFT_PASS; + +err_out: + printf("Not OK\n"); + munmap(map_ptr, page_size * page_count); + return KSFT_FAIL; +} + +int main(int argc, char *argv[]) +{ + int ret, opt; + int prot = 0; + int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; + long 
page_count = KSM_PAGE_COUNT_DEFAULT; + size_t page_size = sysconf(_SC_PAGESIZE); + struct ksm_sysfs ksm_sysfs_old; + int test_name = CHECK_KSM_MERGE; + bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; + bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; + long size_MB = 0; + + while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) { + switch (opt) { + case 'a': + prot = str_to_prot(optarg); + break; + case 'p': + page_count = atol(optarg); + if (page_count <= 0) { + printf("The number of pages must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'l': + ksm_scan_limit_sec = atoi(optarg); + if (ksm_scan_limit_sec <= 0) { + printf("Timeout value must be greater than 0\n"); + return KSFT_FAIL; + } + break; + case 'h': + print_help(); + break; + case 'z': + if (strcmp(optarg, "0") == 0) + use_zero_pages = 0; + else + use_zero_pages = 1; + break; + case 'm': + if (strcmp(optarg, "0") == 0) + merge_across_nodes = 0; + else + merge_across_nodes = 1; + break; + case 's': + size_MB = atoi(optarg); + if (size_MB <= 0) { + printf("Size must be greater than 0\n"); + return KSFT_FAIL; + } + case 'M': + break; + case 'U': + test_name = CHECK_KSM_UNMERGE; + break; + case 'Z': + test_name = CHECK_KSM_ZERO_PAGE_MERGE; + break; + case 'N': + test_name = CHECK_KSM_NUMA_MERGE; + break; + case 'P': + test_name = KSM_MERGE_TIME; + break; + case 'H': + test_name = KSM_MERGE_TIME_HUGE_PAGES; + break; + case 'D': + test_name = KSM_UNMERGE_TIME; + break; + case 'C': + test_name = KSM_COW_TIME; + break; + default: + return KSFT_FAIL; + } + } + + if (prot == 0) + prot = str_to_prot(KSM_PROT_STR_DEFAULT); + + if (access(KSM_SYSFS_PATH, F_OK)) { + printf("Config KSM not enabled\n"); + return KSFT_SKIP; + } + + if (ksm_save_def(&ksm_sysfs_old)) { + printf("Cannot save default tunables\n"); + return KSFT_FAIL; + } + + if (ksm_write_sysfs(KSM_FP("run"), 2) || + ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || + numa_available() ? 
0 : + ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || + ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) + return KSFT_FAIL; + + switch (test_name) { + case CHECK_KSM_MERGE: + ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, page_size); + break; + case CHECK_KSM_UNMERGE: + ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + case CHECK_KSM_ZERO_PAGE_MERGE: + ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, + ksm_scan_limit_sec, use_zero_pages, page_size); + break; + case CHECK_KSM_NUMA_MERGE: + ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + merge_across_nodes, page_size); + break; + case KSM_MERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + size_MB); + break; + case KSM_MERGE_TIME_HUGE_PAGES: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_UNMERGE_TIME: + if (size_MB == 0) { + printf("Option '-s' is required.\n"); + return KSFT_FAIL; + } + ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, + ksm_scan_limit_sec, size_MB); + break; + case KSM_COW_TIME: + ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, + page_size); + break; + } + + if (ksm_restore(&ksm_sysfs_old)) { + printf("Cannot restore default tunables\n"); + return KSFT_FAIL; + } + + return ret; +} --- /dev/null +++ a/tools/testing/selftests/mm/madv_populate.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests + * + * Copyright 2021, Red Hat, Inc. + * + * Author(s): David Hildenbrand <david@xxxxxxxxxx> + */ +#define _GNU_SOURCE +#include <stdlib.h> +#include <string.h> +#include <stdbool.h> +#include <stdint.h> +#include <unistd.h> +#include <errno.h> +#include <fcntl.h> +#include <linux/mman.h> +#include <sys/mman.h> + +#include "../kselftest.h" +#include "vm_util.h" + +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif /* MADV_POPULATE_READ */ +#ifndef MADV_POPULATE_WRITE +#define MADV_POPULATE_WRITE 23 +#endif /* MADV_POPULATE_WRITE */ + +/* + * For now, we're using 2 MiB of private anonymous memory for all tests. 
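 *
 * As a rough sketch of the semantics under test (illustrative only, not
 * part of the test flow): both advices prefault a range up front, with
 * MADV_POPULATE_READ behaving like a read access and MADV_POPULATE_WRITE
 * like a write access:
 *
 *	buf = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
 *		   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 *	madvise(buf, SIZE, MADV_POPULATE_WRITE);	- prefault writable pages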
+ */
+#define SIZE (2 * 1024 * 1024)
+
+static size_t pagesize;
+
+static void sense_support(void)
+{
+	char *addr;
+	int ret;
+
+	addr = mmap(0, pagesize, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	ret = madvise(addr, pagesize, MADV_POPULATE_READ);
+	if (ret)
+		ksft_exit_skip("MADV_POPULATE_READ is not available\n");
+
+	ret = madvise(addr, pagesize, MADV_POPULATE_WRITE);
+	if (ret)
+		ksft_exit_skip("MADV_POPULATE_WRITE is not available\n");
+
+	munmap(addr, pagesize);
+}
+
+static void test_prot_read(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(ret == -1 && errno == EINVAL,
+			 "MADV_POPULATE_WRITE with PROT_READ\n");
+
+	munmap(addr, SIZE);
+}
+
+static void test_prot_write(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(ret == -1 && errno == EINVAL,
+			 "MADV_POPULATE_READ with PROT_WRITE\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n");
+
+	munmap(addr, SIZE);
+}
+
+static void test_holes(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+	ret = munmap(addr + pagesize, pagesize);
+	if (ret)
+		ksft_exit_fail_msg("munmap failed\n");
+
+	/* Hole in the middle */
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_READ with holes in the middle\n");
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_WRITE with holes in the middle\n");
+
+	/* Hole at end */
+	ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_READ with holes at the end\n");
+	ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_WRITE with holes at the end\n");
+
+	/* Hole at beginning */
+	ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_READ with holes at the beginning\n");
+	ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE);
+	ksft_test_result(ret == -1 && errno == ENOMEM,
+			 "MADV_POPULATE_WRITE with holes at the beginning\n");
+
+	munmap(addr, SIZE);
+}
+
+static bool range_is_populated(char *start, ssize_t size)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+	bool ret = true;
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+	for (; size > 0 && ret; size -= pagesize, start += pagesize)
+		if (!pagemap_is_populated(fd, start))
+			ret = false;
+	close(fd);
+	return ret;
+}
+
+static bool range_is_not_populated(char *start, ssize_t size)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+	bool ret = true;
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+	for (; size > 0 && ret; size -= pagesize, start += pagesize)
+		if (pagemap_is_populated(fd, start))
+			ret = false;
+	close(fd);
+	return ret;
+}
+
+static void test_populate_read(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+	ksft_test_result(range_is_not_populated(addr, SIZE),
+			 "range initially not populated\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+	ksft_test_result(range_is_populated(addr, SIZE),
+			 "range is populated\n");
+
+	munmap(addr, SIZE);
+}
+
+static void test_populate_write(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+	ksft_test_result(range_is_not_populated(addr, SIZE),
+			 "range initially not populated\n");
+
+	ret = madvise(addr, SIZE, MADV_POPULATE_WRITE);
+	ksft_test_result(!ret, "MADV_POPULATE_WRITE\n");
+	ksft_test_result(range_is_populated(addr, SIZE),
+			 "range is populated\n");
+
+	munmap(addr, SIZE);
+}
+
+static bool range_is_softdirty(char *start, ssize_t size)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+	bool ret = true;
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+	for (; size > 0 && ret; size -= pagesize, start += pagesize)
+		if (!pagemap_is_softdirty(fd, start))
+			ret = false;
+	close(fd);
+	return ret;
+}
+
+static bool range_is_not_softdirty(char *start, ssize_t size)
+{
+	int fd = open("/proc/self/pagemap", O_RDONLY);
+	bool ret = true;
+
+	if (fd < 0)
+		ksft_exit_fail_msg("opening pagemap failed\n");
+	for (; size > 0 && ret; size -= pagesize, start += pagesize)
+		if (pagemap_is_softdirty(fd, start))
+			ret = false;
+	close(fd);
+	return ret;
+}
+
+static void test_softdirty(void)
+{
+	char *addr;
+	int ret;
+
+	ksft_print_msg("[RUN] %s\n", __func__);
+
+	addr = mmap(0, SIZE, PROT_READ | PROT_WRITE,
+		    MAP_ANONYMOUS | MAP_PRIVATE, 0, 0);
+	if (addr == MAP_FAILED)
+		ksft_exit_fail_msg("mmap failed\n");
+
+	/* Clear any softdirty bits. */
+	clear_softdirty();
+	ksft_test_result(range_is_not_softdirty(addr, SIZE),
+			 "range is not softdirty\n");
+
+	/* Populating READ should not set softdirty. */
+	ret = madvise(addr, SIZE, MADV_POPULATE_READ);
+	ksft_test_result(!ret, "MADV_POPULATE_READ\n");
+	ksft_test_result(range_is_not_softdirty(addr, SIZE),
+			 "range is not softdirty\n");
+
+	/* Populating WRITE should set softdirty.
*/ + ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); + ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); + ksft_test_result(range_is_softdirty(addr, SIZE), + "range is softdirty\n"); + + munmap(addr, SIZE); +} + +int main(int argc, char **argv) +{ + int err; + + pagesize = getpagesize(); + + ksft_print_header(); + ksft_set_plan(21); + + sense_support(); + test_prot_read(); + test_prot_write(); + test_holes(); + test_populate_read(); + test_populate_write(); + test_softdirty(); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + return ksft_exit_pass(); +} --- /dev/null +++ a/tools/testing/selftests/mm/Makefile @@ -0,0 +1,180 @@ +# SPDX-License-Identifier: GPL-2.0 +# Makefile for vm selftests + +LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h + +include local_config.mk + +uname_M := $(shell uname -m 2>/dev/null || echo not) +MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') + +# Without this, failed build products remain, with up-to-date timestamps, +# thus tricking Make (and you!) into believing that All Is Well, in subsequent +# make invocations: +.DELETE_ON_ERROR: + +# Avoid accidental wrong builds, due to built-in rules working just a little +# bit too well--but not quite as well as required for our situation here. +# +# In other words, "make userfaultfd" is supposed to fail to build at all, +# because this Makefile only supports either "make" (all), or "make /full/path". +# However, the built-in rules, if not suppressed, will pick up CFLAGS and the +# initial LDLIBS (but not the target-specific LDLIBS, because those are only +# set for the full path target!). This causes it to get pretty far into building +# things despite using incorrect values such as an *occasionally* incomplete +# LDLIBS. 
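+#
+# A hypothetical session showing the failure mode this guards against
+# (command output abridged):
+#
+#   $ make userfaultfd
+#   cc -Wall ... userfaultfd.c -o userfaultfd	<- built-in %: %.c rule,
+#						   target LDLIBS not applied
+#
+# With --no-builtin-rules, the same invocation instead fails immediately
+# with "No rule to make target 'userfaultfd'".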
+MAKEFLAGS += --no-builtin-rules + +CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) +LDLIBS = -lrt -lpthread +TEST_GEN_FILES = cow +TEST_GEN_FILES += compaction_test +TEST_GEN_FILES += gup_test +TEST_GEN_FILES += hmm-tests +TEST_GEN_FILES += hugetlb-madvise +TEST_GEN_FILES += hugepage-mmap +TEST_GEN_FILES += hugepage-mremap +TEST_GEN_FILES += hugepage-shm +TEST_GEN_FILES += hugepage-vmemmap +TEST_GEN_FILES += khugepaged +TEST_GEN_PROGS = madv_populate +TEST_GEN_FILES += map_fixed_noreplace +TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += map_populate +TEST_GEN_FILES += memfd_secret +TEST_GEN_FILES += migration +TEST_GEN_FILES += mlock-random-test +TEST_GEN_FILES += mlock2-tests +TEST_GEN_FILES += mrelease_test +TEST_GEN_FILES += mremap_dontunmap +TEST_GEN_FILES += mremap_test +TEST_GEN_FILES += on-fault-limit +TEST_GEN_FILES += thuge-gen +TEST_GEN_FILES += transhuge-stress +TEST_GEN_FILES += userfaultfd +TEST_GEN_PROGS += soft-dirty +TEST_GEN_PROGS += split_huge_page_test +TEST_GEN_FILES += ksm_tests +TEST_GEN_PROGS += ksm_functional_tests + +ifeq ($(MACHINE),x86_64) +CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) +CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) +CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) + +VMTARGETS := protection_keys +BINARIES_32 := $(VMTARGETS:%=%_32) +BINARIES_64 := $(VMTARGETS:%=%_64) + +ifeq ($(CAN_BUILD_WITH_NOPIE),1) +CFLAGS += -no-pie +endif + +ifeq ($(CAN_BUILD_I386),1) +TEST_GEN_FILES += $(BINARIES_32) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +TEST_GEN_FILES += $(BINARIES_64) +endif +else + +ifneq (,$(findstring $(MACHINE),ppc64)) +TEST_GEN_FILES += protection_keys +endif + +endif + +ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) +TEST_GEN_FILES += va_128TBswitch +TEST_GEN_FILES += virtual_address_range +TEST_GEN_FILES += write_to_hugetlbfs +endif + +TEST_PROGS := run_vmtests.sh + +TEST_FILES := test_vmalloc.sh +TEST_FILES += test_hmm.sh +TEST_FILES += va_128TBswitch.sh + +include ../lib.mk + +$(OUTPUT)/cow: vm_util.c +$(OUTPUT)/khugepaged: vm_util.c +$(OUTPUT)/ksm_functional_tests: vm_util.c +$(OUTPUT)/madv_populate: vm_util.c +$(OUTPUT)/soft-dirty: vm_util.c +$(OUTPUT)/split_huge_page_test: vm_util.c +$(OUTPUT)/userfaultfd: vm_util.c + +ifeq ($(MACHINE),x86_64) +BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) +BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) + +define gen-target-rule-32 +$(1) $(1)_32: $(OUTPUT)/$(1)_32 +.PHONY: $(1) $(1)_32 +endef + +define gen-target-rule-64 +$(1) $(1)_64: $(OUTPUT)/$(1)_64 +.PHONY: $(1) $(1)_64 +endef + +ifeq ($(CAN_BUILD_I386),1) +$(BINARIES_32): CFLAGS += -m32 -mxsave +$(BINARIES_32): LDLIBS += -lrt -ldl -lm +$(BINARIES_32): $(OUTPUT)/%_32: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) +endif + +ifeq ($(CAN_BUILD_X86_64),1) +$(BINARIES_64): CFLAGS += -m64 -mxsave +$(BINARIES_64): LDLIBS += -lrt -ldl +$(BINARIES_64): $(OUTPUT)/%_64: %.c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ +$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) +endif + +# x86_64 users should be encouraged to install 32-bit libraries +ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) +all: warn_32bit_failure + +warn_32bit_failure: + @echo "Warning: you seem to have a broken 32-bit build" 2>&1; 
\ + echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ + echo "kernels. If you are using a Debian-like distribution," 2>&1; \ + echo "try:"; 2>&1; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 0; +endif +endif + +# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. +$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS) + +$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap + +$(OUTPUT)/ksm_tests: LDLIBS += -lnuma + +$(OUTPUT)/migration: LDLIBS += -lnuma + +local_config.mk local_config.h: check_config.sh + /bin/sh ./check_config.sh $(CC) + +EXTRA_CLEAN += local_config.mk local_config.h + +ifeq ($(COW_EXTRA_LIBS),) +all: warn_missing_liburing + +warn_missing_liburing: + @echo ; \ + echo "Warning: missing liburing support. Some COW tests will be skipped." ; \ + echo +endif --- /dev/null +++ a/tools/testing/selftests/mm/map_fixed_noreplace.c @@ -0,0 +1,231 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Test that MAP_FIXED_NOREPLACE works. + * + * Copyright 2018, Jann Horn <jannh@xxxxxxxxxx> + * Copyright 2018, Michael Ellerman, IBM Corporation. + */ + +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#ifndef MAP_FIXED_NOREPLACE +#define MAP_FIXED_NOREPLACE 0x100000 +#endif + +static void dump_maps(void) +{ + char cmd[32]; + + snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); + system(cmd); +} + +static unsigned long find_base_addr(unsigned long size) +{ + void *addr; + unsigned long flags; + + flags = MAP_PRIVATE | MAP_ANONYMOUS; + addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); + if (addr == MAP_FAILED) { + printf("Error: couldn't map the space we need for the test\n"); + return 0; + } + + if (munmap(addr, size) != 0) { + printf("Error: couldn't map the space we need for the test\n"); + return 0; + } + return (unsigned long)addr; +} + +int main(void) +{ + unsigned long base_addr; + unsigned long flags, addr, size, page_size; + char *p; + + page_size = sysconf(_SC_PAGE_SIZE); + + //let's find a base addr that is free before we start the tests + size = 5 * page_size; + base_addr = find_base_addr(size); + if (!base_addr) { + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + + flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; + + // Check we can map all the areas we need below + errno = 0; + addr = base_addr; + size = 5 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: couldn't map the space we need for the test\n"); + return 1; + } + + errno = 0; + if (munmap((void *)addr, 5 * page_size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + errno = 0; + addr = base_addr + page_size; + size = 3 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error: first mmap() failed unexpectedly\n"); + return 1; + } + + /* + * Exact same mapping again: + * base | free | new + * +1 | mapped | new + * +2 | mapped | new + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = base_addr; + size = 5 * page_size; + p = 
mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:1: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Second mapping contained within first: + * + * base | free | + * +1 | mapped | + * +2 | mapped | new + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr + (2 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:2: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | new + * +4 | free | new + */ + errno = 0; + addr = base_addr + (3 * page_size); + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:3: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Overlap start of existing mapping: + * base | free | new + * +1 | mapped | new + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr; + size = 2 * page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p != MAP_FAILED) { + dump_maps(); + printf("Error:4: mmap() succeeded when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to start of existing mapping: + * base | free | new + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | + */ + errno = 0; + addr = base_addr; + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:5: mmap() failed when it shouldn't have\n"); + return 1; + } + + /* + * Adjacent to end of existing mapping: + * base | free | + * +1 | mapped | + * +2 | mapped | + * +3 | mapped | + * +4 | free | new + */ + errno = 0; + addr = base_addr + (4 * page_size); + size = page_size; + p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); + printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); + + if (p == MAP_FAILED) { + dump_maps(); + printf("Error:6: mmap() failed when it shouldn't have\n"); + return 1; + } + + addr = base_addr; + size = 5 * page_size; + if (munmap((void *)addr, size) != 0) { + dump_maps(); + printf("Error: munmap failed!?\n"); + return 1; + } + printf("unmap() successful\n"); + + printf("OK\n"); + return 0; +} --- /dev/null +++ a/tools/testing/selftests/mm/map_hugetlb.c @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Example of using hugepage memory in a user application using the mmap + * system call with MAP_HUGETLB flag. Before running this program make + * sure the administrator has allocated enough default sized huge pages + * to cover the 256 MB allocation. + * + * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. + * That means the addresses starting with 0x800000... will need to be + * specified. Specifying a fixed address is not required on ppc64, i386 + * or x86_64. 
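+ *
+ * For example, with the common 2 MB default huge page size, covering the
+ * 256 MB mapping below takes 128 reserved pages (a sketch; run as root):
+ *
+ *	echo 128 > /proc/sys/vm/nr_hugepages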
+ */ +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/mman.h> +#include <fcntl.h> + +#define LENGTH (256UL*1024*1024) +#define PROTECTION (PROT_READ | PROT_WRITE) + +#ifndef MAP_HUGETLB +#define MAP_HUGETLB 0x40000 /* arch specific */ +#endif + +#ifndef MAP_HUGE_SHIFT +#define MAP_HUGE_SHIFT 26 +#endif + +#ifndef MAP_HUGE_MASK +#define MAP_HUGE_MASK 0x3f +#endif + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) +#else +#define ADDR (void *)(0x0UL) +#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) +#endif + +static void check_bytes(char *addr) +{ + printf("First hex is %x\n", *((unsigned int *)addr)); +} + +static void write_bytes(char *addr, size_t length) +{ + unsigned long i; + + for (i = 0; i < length; i++) + *(addr + i) = (char)i; +} + +static int read_bytes(char *addr, size_t length) +{ + unsigned long i; + + check_bytes(addr); + for (i = 0; i < length; i++) + if (*(addr + i) != (char)i) { + printf("Mismatch at %lu\n", i); + return 1; + } + return 0; +} + +int main(int argc, char **argv) +{ + void *addr; + int ret; + size_t length = LENGTH; + int flags = FLAGS; + int shift = 0; + + if (argc > 1) + length = atol(argv[1]) << 20; + if (argc > 2) { + shift = atoi(argv[2]); + if (shift) + flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } + + if (shift) + printf("%u kB hugepages\n", 1 << (shift - 10)); + else + printf("Default size hugepages\n"); + printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); + + addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(1); + } + + printf("Returned address is %p\n", addr); + check_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); + + /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ + if (munmap(addr, length)) { + perror("munmap"); + exit(1); + } + + return ret; +} --- /dev/null +++ a/tools/testing/selftests/mm/map_populate.c @@ -0,0 +1,113 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018 Dmitry Safonov, Arista Networks + * + * MAP_POPULATE | MAP_PRIVATE should COW VMA pages. 
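+ *
+ * Rough outline of the check below: the parent maps a file MAP_SHARED and
+ * stores 0xdeadbabe, the child maps the same file MAP_PRIVATE |
+ * MAP_POPULATE, the parent then overwrites the file, and the child's
+ * prefaulted private copy must still read back 0xdeadbabe.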
+ */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#ifndef MMAP_SZ +#define MMAP_SZ 4096 +#endif + +#define BUG_ON(condition, description) \ + do { \ + if (condition) { \ + fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ + __LINE__, (description), strerror(errno)); \ + exit(1); \ + } \ + } while (0) + +static int parent_f(int sock, unsigned long *smap, int child) +{ + int status, ret; + + ret = read(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + *smap = 0x22222BAD; + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = write(sock, &status, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + waitpid(child, &status, 0); + BUG_ON(!WIFEXITED(status), "child in unexpected state"); + + return WEXITSTATUS(status); +} + +static int child_f(int sock, unsigned long *smap, int fd) +{ + int ret, buf = 0; + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_POPULATE, fd, 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file"); + + ret = write(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "write(sock)"); + + ret = read(sock, &buf, sizeof(int)); + BUG_ON(ret <= 0, "read(sock)"); + + BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); + BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); + + return 0; +} + +int main(int argc, char **argv) +{ + int sock[2], child, ret; + FILE *ftmp; + unsigned long *smap; + + ftmp = tmpfile(); + BUG_ON(ftmp == 0, "tmpfile()"); + + ret = ftruncate(fileno(ftmp), MMAP_SZ); + BUG_ON(ret, "ftruncate()"); + + smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, + MAP_SHARED, fileno(ftmp), 0); + BUG_ON(smap == MAP_FAILED, "mmap()"); + + *smap = 0xdeadbabe; + /* Probably unnecessary, but let it be. */ + ret = msync(smap, MMAP_SZ, MS_SYNC); + BUG_ON(ret, "msync()"); + + ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock); + BUG_ON(ret, "socketpair()"); + + child = fork(); + BUG_ON(child == -1, "fork()"); + + if (child) { + ret = close(sock[0]); + BUG_ON(ret, "close()"); + + return parent_f(sock[1], smap, child); + } + + ret = close(sock[1]); + BUG_ON(ret, "close()"); + + return child_f(sock[0], smap, fileno(ftmp)); +} --- /dev/null +++ a/tools/testing/selftests/mm/memfd_secret.c @@ -0,0 +1,296 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport <rppt@xxxxxxxxxxxxx> + */ + +#define _GNU_SOURCE +#include <sys/uio.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <sys/types.h> +#include <sys/ptrace.h> +#include <sys/syscall.h> +#include <sys/resource.h> +#include <sys/capability.h> + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <stdio.h> + +#include "../kselftest.h" + +#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) +#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) +#define skip(fmt, ...) 
ksft_test_result_skip(fmt, ##__VA_ARGS__) + +#ifdef __NR_memfd_secret + +#define PATTERN 0x55 + +static const int prot = PROT_READ | PROT_WRITE; +static const int mode = MAP_SHARED; + +static unsigned long page_size; +static unsigned long mlock_limit_cur; +static unsigned long mlock_limit_max; + +static int memfd_secret(unsigned int flags) +{ + return syscall(__NR_memfd_secret, flags); +} + +static void test_file_apis(int fd) +{ + char buf[64]; + + if ((read(fd, buf, sizeof(buf)) >= 0) || + (write(fd, buf, sizeof(buf)) >= 0) || + (pread(fd, buf, sizeof(buf), 0) >= 0) || + (pwrite(fd, buf, sizeof(buf), 0) >= 0)) + fail("unexpected file IO\n"); + else + pass("file IO is blocked as expected\n"); +} + +static void test_mlock_limit(int fd) +{ + size_t len; + char *mem; + + len = mlock_limit_cur; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("unable to mmap secret memory\n"); + return; + } + munmap(mem, len); + + len = mlock_limit_max * 2; + mem = mmap(NULL, len, prot, mode, fd, 0); + if (mem != MAP_FAILED) { + fail("unexpected mlock limit violation\n"); + munmap(mem, len); + return; + } + + pass("mlock limit is respected\n"); +} + +static void try_process_vm_read(int fd, int pipefd[2]) +{ + struct iovec liov, riov; + char buf[64]; + char *mem; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + exit(KSFT_FAIL); + } + + liov.iov_len = riov.iov_len = sizeof(buf); + liov.iov_base = buf; + riov.iov_base = mem; + + if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { + if (errno == ENOSYS) + exit(KSFT_SKIP); + exit(KSFT_PASS); + } + + exit(KSFT_FAIL); +} + +static void try_ptrace(int fd, int pipefd[2]) +{ + pid_t ppid = getppid(); + int status; + char *mem; + long ret; + + if (read(pipefd[0], &mem, sizeof(mem)) < 0) { + perror("pipe write"); + exit(KSFT_FAIL); + } + + ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); + if (ret) { + perror("ptrace_attach"); + exit(KSFT_FAIL); + } + + ret = waitpid(ppid, &status, WUNTRACED); + if ((ret != ppid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitppid result %ld stat %x\n", + ret, status); + exit(KSFT_FAIL); + } + + if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) + exit(KSFT_PASS); + + exit(KSFT_FAIL); +} + +static void check_child_status(pid_t pid, const char *name) +{ + int status; + + waitpid(pid, &status, 0); + + if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { + skip("%s is not supported\n", name); + return; + } + + if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || + WIFSIGNALED(status)) { + pass("%s is blocked as expected\n", name); + return; + } + + fail("%s: unexpected memory access\n", name); +} + +static void test_remote_access(int fd, const char *name, + void (*func)(int fd, int pipefd[2])) +{ + int pipefd[2]; + pid_t pid; + char *mem; + + if (pipe(pipefd)) { + fail("pipe failed: %s\n", strerror(errno)); + return; + } + + pid = fork(); + if (pid < 0) { + fail("fork failed: %s\n", strerror(errno)); + return; + } + + if (pid == 0) { + func(fd, pipefd); + return; + } + + mem = mmap(NULL, page_size, prot, mode, fd, 0); + if (mem == MAP_FAILED) { + fail("Unable to mmap secret memory\n"); + return; + } + + ftruncate(fd, page_size); + memset(mem, PATTERN, page_size); + + if (write(pipefd[1], &mem, sizeof(mem)) < 0) { + fail("pipe write: %s\n", strerror(errno)); + return; + } + + check_child_status(pid, name); +} + +static void test_process_vm_read(int fd) +{ + test_remote_access(fd, "process_vm_read", try_process_vm_read); +} + +static void 
test_ptrace(int fd) +{ + test_remote_access(fd, "ptrace", try_ptrace); +} + +static int set_cap_limits(rlim_t max) +{ + struct rlimit new; + cap_t cap = cap_init(); + + new.rlim_cur = max; + new.rlim_max = max; + if (setrlimit(RLIMIT_MEMLOCK, &new)) { + perror("setrlimit() returns error"); + return -1; + } + + /* drop capabilities including CAP_IPC_LOCK */ + if (cap_set_proc(cap)) { + perror("cap_set_proc() returns error"); + return -2; + } + + return 0; +} + +static void prepare(void) +{ + struct rlimit rlim; + + page_size = sysconf(_SC_PAGE_SIZE); + if (!page_size) + ksft_exit_fail_msg("Failed to get page size %s\n", + strerror(errno)); + + if (getrlimit(RLIMIT_MEMLOCK, &rlim)) + ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", + strerror(errno)); + + mlock_limit_cur = rlim.rlim_cur; + mlock_limit_max = rlim.rlim_max; + + printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", + page_size, mlock_limit_cur, mlock_limit_max); + + if (page_size > mlock_limit_cur) + mlock_limit_cur = page_size; + if (page_size > mlock_limit_max) + mlock_limit_max = page_size; + + if (set_cap_limits(mlock_limit_max)) + ksft_exit_fail_msg("Unable to set mlock limit: %s\n", + strerror(errno)); +} + +#define NUM_TESTS 4 + +int main(int argc, char *argv[]) +{ + int fd; + + prepare(); + + ksft_print_header(); + ksft_set_plan(NUM_TESTS); + + fd = memfd_secret(0); + if (fd < 0) { + if (errno == ENOSYS) + ksft_exit_skip("memfd_secret is not supported\n"); + else + ksft_exit_fail_msg("memfd_secret failed: %s\n", + strerror(errno)); + } + + test_mlock_limit(fd); + test_file_apis(fd); + test_process_vm_read(fd); + test_ptrace(fd); + + close(fd); + + ksft_finished(); +} + +#else /* __NR_memfd_secret */ + +int main(int argc, char *argv[]) +{ + printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_memfd_secret */ --- /dev/null +++ a/tools/testing/selftests/mm/migration.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The main purpose of the tests here is to exercise the migration entry code + * paths in the kernel. 
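+ *
+ * Roughly: reader threads spin on a mapping while the main thread bounces
+ * its pages between two NUMA nodes, e.g.
+ *
+ *	move_pages(0, 1, (void **)&ptr, &node, &status, MPOL_MF_MOVE_ALL);
+ *
+ * so the readers keep faulting on migration entries.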
+ */ + +#include "../kselftest_harness.h" +#include <strings.h> +#include <pthread.h> +#include <numa.h> +#include <numaif.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <signal.h> +#include <time.h> + +#define TWOMEG (2<<20) +#define RUNTIME (60) + +#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) + +FIXTURE(migration) +{ + pthread_t *threads; + pid_t *pids; + int nthreads; + int n1; + int n2; +}; + +FIXTURE_SETUP(migration) +{ + int n; + + ASSERT_EQ(numa_available(), 0); + self->nthreads = numa_num_task_cpus() - 1; + self->n1 = -1; + self->n2 = -1; + + for (n = 0; n < numa_max_possible_node(); n++) + if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) { + if (self->n1 == -1) { + self->n1 = n; + } else { + self->n2 = n; + break; + } + } + + self->threads = malloc(self->nthreads * sizeof(*self->threads)); + ASSERT_NE(self->threads, NULL); + self->pids = malloc(self->nthreads * sizeof(*self->pids)); + ASSERT_NE(self->pids, NULL); +}; + +FIXTURE_TEARDOWN(migration) +{ + free(self->threads); + free(self->pids); +} + +int migrate(uint64_t *ptr, int n1, int n2) +{ + int ret, tmp; + int status = 0; + struct timespec ts1, ts2; + + if (clock_gettime(CLOCK_MONOTONIC, &ts1)) + return -1; + + while (1) { + if (clock_gettime(CLOCK_MONOTONIC, &ts2)) + return -1; + + if (ts2.tv_sec - ts1.tv_sec >= RUNTIME) + return 0; + + ret = move_pages(0, 1, (void **) &ptr, &n2, &status, + MPOL_MF_MOVE_ALL); + if (ret) { + if (ret > 0) + printf("Didn't migrate %d pages\n", ret); + else + perror("Couldn't migrate pages"); + return -2; + } + + tmp = n2; + n2 = n1; + n1 = tmp; + } + + return 0; +} + +void *access_mem(void *ptr) +{ + uint64_t y = 0; + volatile uint64_t *x = ptr; + + while (1) { + pthread_testcancel(); + y += *x; + } + + return NULL; +} + +/* + * Basic migration entry testing. One thread will move pages back and forth + * between nodes whilst other threads try and access them triggering the + * migration entry wait paths in the kernel. + */ +TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) +{ + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) + if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) + perror("Couldn't create thread"); + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(pthread_cancel(self->threads[i]), 0); +} + +/* + * Same as the previous test but with shared memory. + */ +TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) +{ + pid_t pid; + uint64_t *ptr; + int i; + + if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) + SKIP(return, "Not enough threads or NUMA nodes available"); + + ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + memset(ptr, 0xde, TWOMEG); + for (i = 0; i < self->nthreads - 1; i++) { + pid = fork(); + if (!pid) + access_mem(ptr); + else + self->pids[i] = pid; + } + + ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); + for (i = 0; i < self->nthreads - 1; i++) + ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); +} + +/* + * Tests the pmd migration entry paths. 
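+ *
+ * The mapping is overallocated and then rounded up to a 2MB boundary by
+ * hand, since a PMD-mapped THP requires a naturally aligned 2MB range.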
+ */
+TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME)
+{
+	uint64_t *ptr;
+	int i;
+
+	if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0)
+		SKIP(return, "Not enough threads or NUMA nodes available");
+
+	ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE,
+		MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	ASSERT_NE(ptr, MAP_FAILED);
+
+	ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG);
+	ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0);
+	memset(ptr, 0xde, TWOMEG);
+	for (i = 0; i < self->nthreads - 1; i++)
+		if (pthread_create(&self->threads[i], NULL, access_mem, ptr))
+			perror("Couldn't create thread");
+
+	ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0);
+	for (i = 0; i < self->nthreads - 1; i++)
+		ASSERT_EQ(pthread_cancel(self->threads[i]), 0);
+}
+
+TEST_HARNESS_MAIN
--- /dev/null
+++ a/tools/testing/selftests/mm/mlock2.h
@@ -0,0 +1,63 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef MLOCK_ONFAULT
+#define MLOCK_ONFAULT 1
+#endif
+
+#ifndef MCL_ONFAULT
+#define MCL_ONFAULT (MCL_FUTURE << 1)
+#endif
+
+static int mlock2_(void *start, size_t len, int flags)
+{
+#ifdef __NR_mlock2
+	return syscall(__NR_mlock2, start, len, flags);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+static FILE *seek_to_smaps_entry(unsigned long addr)
+{
+	FILE *file;
+	char *line = NULL;
+	size_t size = 0;
+	unsigned long start, end;
+	char perms[5];
+	unsigned long offset;
+	char dev[32];
+	unsigned long inode;
+	char path[BUFSIZ];
+
+	file = fopen("/proc/self/smaps", "r");
+	if (!file) {
+		perror("fopen smaps");
+		_exit(1);
+	}
+
+	while (getline(&line, &size, file) > 0) {
+		if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n",
+			   &start, &end, perms, &offset, dev, &inode, path) < 6)
+			goto next;
+
+		if (start <= addr && addr < end)
+			goto out;
+
+next:
+		free(line);
+		line = NULL;
+		size = 0;
+	}
+
+	fclose(file);
+	file = NULL;
+
+out:
+	free(line);
+	return file;
+}
--- /dev/null
+++ a/tools/testing/selftests/mm/mlock2-tests.c
@@ -0,0 +1,520 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <stdbool.h>
+#include "mlock2.h"
+
+#include "../kselftest.h"
+
+struct vm_boundaries {
+	unsigned long start;
+	unsigned long end;
+};
+
+static int get_vm_area(unsigned long addr, struct vm_boundaries *area)
+{
+	FILE *file;
+	int ret = 1;
+	char line[1024] = {0};
+	char *end_addr;
+	char *stop;
+	unsigned long start;
+	unsigned long end;
+
+	if (!area)
+		return ret;
+
+	file = fopen("/proc/self/maps", "r");
+	if (!file) {
+		perror("fopen");
+		return ret;
+	}
+
+	memset(area, 0, sizeof(struct vm_boundaries));
+
+	while(fgets(line, 1024, file)) {
+		end_addr = strchr(line, '-');
+		if (!end_addr) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+		*end_addr = '\0';
+		end_addr++;
+		stop = strchr(end_addr, ' ');
+		if (!stop) {
+			printf("cannot parse /proc/self/maps\n");
+			goto out;
+		}
+		*stop = '\0';
+
+		sscanf(line, "%lx", &start);
+		sscanf(end_addr, "%lx", &end);
+
+		if (start <= addr && end > addr) {
+			area->start = start;
+			area->end = end;
+			ret = 0;
+			goto out;
+		}
+	}
+out:
+	fclose(file);
+	return ret;
+}
+
+#define VMFLAGS "VmFlags:"
+
+static bool is_vmflag_set(unsigned long addr, const char *vmflag)
+{
+	char *line = NULL;
+	char *flags;
+	size_t size = 0;
+	bool ret = false;
+	FILE *smaps;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
printf("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, VMFLAGS)) { + free(line); + line = NULL; + size = 0; + continue; + } + + flags = line + strlen(VMFLAGS); + ret = (strstr(flags, vmflag) != NULL); + goto out; + } + +out: + free(line); + fclose(smaps); + return ret; +} + +#define SIZE "Size:" +#define RSS "Rss:" +#define LOCKED "lo" + +static unsigned long get_value_for_name(unsigned long addr, const char *name) +{ + char *line = NULL; + size_t size = 0; + char *value_ptr; + FILE *smaps = NULL; + unsigned long value = -1UL; + + smaps = seek_to_smaps_entry(addr); + if (!smaps) { + printf("Unable to parse /proc/self/smaps\n"); + goto out; + } + + while (getline(&line, &size, smaps) > 0) { + if (!strstr(line, name)) { + free(line); + line = NULL; + size = 0; + continue; + } + + value_ptr = line + strlen(name); + if (sscanf(value_ptr, "%lu kB", &value) < 1) { + printf("Unable to parse smaps entry for Size\n"); + goto out; + } + break; + } + +out: + if (smaps) + fclose(smaps); + free(line); + return value; +} + +static bool is_vma_lock_on_fault(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; + + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + /* only one page is faulted in */ + return (vma_rss < vma_size); +} + +#define PRESENT_BIT 0x8000000000000000ULL +#define PFN_MASK 0x007FFFFFFFFFFFFFULL +#define UNEVICTABLE_BIT (1UL << 18) + +static int lock_check(unsigned long addr) +{ + bool locked; + unsigned long vma_size, vma_rss; + + locked = is_vmflag_set(addr, LOCKED); + if (!locked) + return false; + + vma_size = get_value_for_name(addr, SIZE); + vma_rss = get_value_for_name(addr, RSS); + + return (vma_rss == vma_size); +} + +static int unlock_lock_check(char *map) +{ + if (is_vmflag_set((unsigned long)map, LOCKED)) { + printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); + return 1; + } + + return 0; +} + +static int test_mlock_lock() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + if (mlock2_(map, 2 * page_size, 0)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(0)"); + goto unmap; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + perror("munlock()"); + goto unmap; + } + + ret = unlock_lock_check(map); + +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int onfault_check(char *map) +{ + *map = 'a'; + if (!is_vma_lock_on_fault((unsigned long)map)) { + printf("VMA is not marked for lock on fault\n"); + return 1; + } + + return 0; +} + +static int unlock_onfault_check(char *map) +{ + unsigned long page_size = getpagesize(); + + if (is_vma_lock_on_fault((unsigned long)map) || + is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA is still lock on fault after unlock\n"); + return 1; + } + + return 0; +} + +static int test_mlock_onfault() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked 
mmap"); + goto out; + } + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(MLOCK_ONFAULT)"); + goto unmap; + } + + if (onfault_check(map)) + goto unmap; + + /* Now unlock and recheck attributes */ + if (munlock(map, 2 * page_size)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("munlock()"); + goto unmap; + } + + ret = unlock_onfault_check(map); +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int test_lock_onfault_of_present() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("test_mlock_locked mmap"); + goto out; + } + + *map = 'a'; + + if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock2(MLOCK_ONFAULT)"); + goto unmap; + } + + if (!is_vma_lock_on_fault((unsigned long)map) || + !is_vma_lock_on_fault((unsigned long)map + page_size)) { + printf("VMA with present pages is not marked lock on fault\n"); + goto unmap; + } + ret = 0; +unmap: + munmap(map, 2 * page_size); +out: + return ret; +} + +static int test_munlockall() +{ + char *map; + int ret = 1; + unsigned long page_size = getpagesize(); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if (map == MAP_FAILED) { + perror("test_munlockall mmap"); + goto out; + } + + if (mlockall(MCL_CURRENT)) { + perror("mlockall(MCL_CURRENT)"); + goto out; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + if (unlock_lock_check(map)) + goto unmap; + + munmap(map, 2 * page_size); + + map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + if (map == MAP_FAILED) { + perror("test_munlockall second mmap"); + goto out; + } + + if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { + perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); + goto unmap; + } + + if (onfault_check(map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + if (unlock_onfault_check(map)) + goto unmap; + + if (mlockall(MCL_CURRENT | MCL_FUTURE)) { + perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); + goto out; + } + + if (!lock_check((unsigned long)map)) + goto unmap; + + if (munlockall()) { + perror("munlockall()"); + goto unmap; + } + + ret = unlock_lock_check(map); + +unmap: + munmap(map, 2 * page_size); +out: + munlockall(); + return ret; +} + +static int test_vma_management(bool call_mlock) +{ + int ret = 1; + void *map; + unsigned long page_size = getpagesize(); + struct vm_boundaries page1; + struct vm_boundaries page2; + struct vm_boundaries page3; + + map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (map == MAP_FAILED) { + perror("mmap()"); + return ret; + } + + if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { + if (errno == ENOSYS) { + printf("Cannot call new mlock family, skipping test\n"); + _exit(KSFT_SKIP); + } + perror("mlock(ONFAULT)\n"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + 
printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* + * Before we unlock a portion, we need to that all three pages are in + * the same VMA. If they are not we abort this test (Note that this is + * not a failure) + */ + if (page1.start != page2.start || page2.start != page3.start) { + printf("VMAs are not merged to start, aborting test\n"); + ret = 0; + goto out; + } + + if (munlock(map + page_size, page_size)) { + perror("munlock()"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* All three VMAs should be different */ + if (page1.start == page2.start || page2.start == page3.start) { + printf("failed to split VMA for munlock\n"); + goto out; + } + + /* Now unlock the first and third page and check the VMAs again */ + if (munlock(map, page_size * 3)) { + perror("munlock()"); + goto out; + } + + if (get_vm_area((unsigned long)map, &page1) || + get_vm_area((unsigned long)map + page_size, &page2) || + get_vm_area((unsigned long)map + page_size * 2, &page3)) { + printf("couldn't find mapping in /proc/self/maps\n"); + goto out; + } + + /* Now all three VMAs should be the same */ + if (page1.start != page2.start || page2.start != page3.start) { + printf("failed to merge VMAs after munlock\n"); + goto out; + } + + ret = 0; +out: + munmap(map, 3 * page_size); + return ret; +} + +static int test_mlockall(int (test_function)(bool call_mlock)) +{ + int ret = 1; + + if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { + perror("mlockall"); + return ret; + } + + ret = test_function(false); + munlockall(); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + ret += test_mlock_lock(); + ret += test_mlock_onfault(); + ret += test_munlockall(); + ret += test_lock_onfault_of_present(); + ret += test_vma_management(true); + ret += test_mlockall(test_vma_management); + return ret; +} --- /dev/null +++ a/tools/testing/selftests/mm/mlock-random-test.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * It tests the mlock/mlock2() when they are invoked + * on randomly memory region. 
+ */
+#include <unistd.h>
+#include <sys/resource.h>
+#include <sys/capability.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <string.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <time.h>
+#include "mlock2.h"
+
+#define CHUNK_UNIT (128 * 1024)
+#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2)
+#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT
+#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3)
+
+#define TEST_LOOP 100
+#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1))
+
+int set_cap_limits(rlim_t max)
+{
+	struct rlimit new;
+	cap_t cap = cap_init();
+
+	new.rlim_cur = max;
+	new.rlim_max = max;
+	if (setrlimit(RLIMIT_MEMLOCK, &new)) {
+		perror("setrlimit() returns error");
+		return -1;
+	}
+
+	/* drop capabilities including CAP_IPC_LOCK */
+	if (cap_set_proc(cap)) {
+		perror("cap_set_proc() returns error");
+		return -2;
+	}
+
+	return 0;
+}
+
+int get_proc_locked_vm_size(void)
+{
+	FILE *f;
+	int ret = -1;
+	char line[1024] = {0};
+	unsigned long lock_size = 0;
+
+	f = fopen("/proc/self/status", "r");
+	if (!f) {
+		perror("fopen");
+		return -1;
+	}
+
+	while (fgets(line, 1024, f)) {
+		if (strstr(line, "VmLck")) {
+			ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size);
+			if (ret <= 0) {
+				printf("sscanf() on VmLck error: %s: %d\n",
+				       line, ret);
+				fclose(f);
+				return -1;
+			}
+			fclose(f);
+			return (int)(lock_size << 10);
+		}
+	}
+
+	perror("cannot parse VmLck in /proc/self/status");
+	fclose(f);
+	return -1;
+}
+
+/*
+ * Get the MMUPageSize of the memory region including the input
+ * address, from the proc file.
+ *
+ * return value: on error case, 0 will be returned.
+ * Otherwise the page size (in bytes) is returned.
+ */
+int get_proc_page_size(unsigned long addr)
+{
+	FILE *smaps;
+	char *line = NULL;
+	unsigned long mmupage_size = 0;
+	size_t size = 0;
+
+	smaps = seek_to_smaps_entry(addr);
+	if (!smaps) {
+		printf("Unable to parse /proc/self/smaps\n");
+		return 0;
+	}
+
+	while (getline(&line, &size, smaps) > 0) {
+		if (!strstr(line, "MMUPageSize")) {
+			free(line);
+			line = NULL;
+			size = 0;
+			continue;
+		}
+
+		/* found the MMUPageSize of this section */
+		if (sscanf(line, "MMUPageSize: %8lu kB",
+			   &mmupage_size) < 1) {
+			printf("Unable to parse smaps entry for Size:%s\n",
+			       line);
+			break;
+		}
+
+	}
+	free(line);
+	if (smaps)
+		fclose(smaps);
+	return mmupage_size << 10;
+}
+
+/*
+ * Test mlock/mlock2() on the provided memory chunk.
+ * It expects the mlock/mlock2() calls to be successful (within rlimit)
+ *
+ * With allocated memory chunk [p, p + alloc_size), this
+ * test will choose start/len randomly and perform mlock/mlock2
+ * on the [start, start + len] memory range.  The range lies
+ * within the allocated chunk.
+ *
+ * The memory region size alloc_size is within the rlimit.
+ * So we always expect a success of mlock/mlock2.
+ *
+ * VmLck is assumed to be 0 before this test.
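+ *
+ * For example, with alloc_size = 128 KB one iteration might pick
+ * lock_size = 40 KB and start_offset = 60 KB (illustrative values), so
+ * [p + 60 KB, p + 100 KB) is locked, which always stays inside
+ * [p, p + alloc_size).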
+ * + * return value: 0 - success + * else: failure + */ +int test_mlock_within_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0; + struct rlimit cur; + int page_size = 0; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur < alloc_size) { + printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n", + alloc_size, (unsigned int)cur.rlim_cur); + return -1; + } + + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + /* + * - choose mlock/mlock2 randomly + * - choose lock_size randomly but lock_size < alloc_size + * - choose start_offset randomly but p+start_offset+lock_size + * < p+alloc_size + */ + int is_mlock = !!(rand() % 2); + int lock_size = rand() % alloc_size; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + + if (ret) { + printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", + is_mlock ? "mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + return ret; + } + } + + /* + * Check VmLck left by the tests. + */ + locked_vm_size = get_proc_locked_vm_size(); + page_size = get_proc_page_size((unsigned long)p); + if (page_size == 0) { + printf("cannot get proc MMUPageSize\n"); + return -1; + } + + if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) { + printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n", + locked_vm_size, alloc_size); + return -1; + } + + return 0; +} + + +/* + * We expect the mlock/mlock2() to be fail (outof limitation) + * + * With allocated memory chunk [p, p + alloc_size), this + * test will randomly choose start/len and perform mlock/mlock2 + * on [start, start+len] range. + * + * The memory region size alloc_size is above the rlimit. + * And the len to be locked is higher than rlimit. + * So we always expect a failure of mlock/mlock2. + * No locked page number should be increased as a side effect. + * + * return value: 0 - success + * else: failure + */ +int test_mlock_outof_limit(char *p, int alloc_size) +{ + int i; + int ret = 0; + int locked_vm_size = 0, old_locked_vm_size = 0; + struct rlimit cur; + + getrlimit(RLIMIT_MEMLOCK, &cur); + if (cur.rlim_cur >= alloc_size) { + printf("alloc_size[%d] >%u rlimit, violates test condition\n", + alloc_size, (unsigned int)cur.rlim_cur); + return -1; + } + + old_locked_vm_size = get_proc_locked_vm_size(); + srand(time(NULL)); + for (i = 0; i < TEST_LOOP; i++) { + int is_mlock = !!(rand() % 2); + int lock_size = (rand() % (alloc_size - cur.rlim_cur)) + + cur.rlim_cur; + int start_offset = rand() % (alloc_size - lock_size); + + if (is_mlock) + ret = mlock(p + start_offset, lock_size); + else + ret = mlock2_(p + start_offset, lock_size, + MLOCK_ONFAULT); + if (ret == 0) { + printf("%s() succeeds? on %p(%d) mlock%p(%d)\n", + is_mlock ? 
"mlock" : "mlock2", + p, alloc_size, + p + start_offset, lock_size); + return -1; + } + } + + locked_vm_size = get_proc_locked_vm_size(); + if (locked_vm_size != old_locked_vm_size) { + printf("tests leads to new mlocked page: old[%d], new[%d]\n", + old_locked_vm_size, + locked_vm_size); + return -1; + } + + return 0; +} + +int main(int argc, char **argv) +{ + char *p = NULL; + int ret = 0; + + if (set_cap_limits(MLOCK_RLIMIT_SIZE)) + return -1; + + p = malloc(MLOCK_WITHIN_LIMIT_SIZE); + if (p == NULL) { + perror("malloc() failure\n"); + return -1; + } + ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); + if (ret) + return ret; + munlock(p, MLOCK_WITHIN_LIMIT_SIZE); + free(p); + + + p = malloc(MLOCK_OUTOF_LIMIT_SIZE); + if (p == NULL) { + perror("malloc() failure\n"); + return -1; + } + ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); + if (ret) + return ret; + munlock(p, MLOCK_OUTOF_LIMIT_SIZE); + free(p); + + return 0; +} --- /dev/null +++ a/tools/testing/selftests/mm/mrelease_test.c @@ -0,0 +1,206 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2022 Google LLC + */ +#define _GNU_SOURCE +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <sys/wait.h> +#include <unistd.h> + +#include "util.h" + +#include "../kselftest.h" + +#ifndef __NR_pidfd_open +#define __NR_pidfd_open -1 +#endif + +#ifndef __NR_process_mrelease +#define __NR_process_mrelease -1 +#endif + +#define MB(x) (x << 20) +#define MAX_SIZE_MB 1024 + +static int alloc_noexit(unsigned long nr_pages, int pipefd) +{ + int ppid = getppid(); + int timeout = 10; /* 10sec timeout to get killed */ + unsigned long i; + char *buf; + + buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, 0, 0); + if (buf == MAP_FAILED) { + perror("mmap failed, halting the test"); + return KSFT_FAIL; + } + + for (i = 0; i < nr_pages; i++) + *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; + + /* Signal the parent that the child is ready */ + if (write(pipefd, "", 1) < 0) { + perror("write"); + return KSFT_FAIL; + } + + /* Wait to be killed (when reparenting happens) */ + while (getppid() == ppid && timeout > 0) { + sleep(1); + timeout--; + } + + munmap(buf, nr_pages * PAGE_SIZE); + + return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; +} + +/* The process_mrelease calls in this test are expected to fail */ +static void run_negative_tests(int pidfd) +{ + int res; + /* Test invalid flags. Expect to fail with EINVAL error code. */ + if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || + errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease with wrong flags"); + exit(res); + } + /* + * Test reaping while process is alive with no pending SIGKILL. + * Expect to fail with EINVAL error code. + */ + if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { + res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); + perror("process_mrelease on a live process"); + exit(res); + } +} + +static int child_main(int pipefd[], size_t size) +{ + int res; + + /* Allocate and fault-in memory and wait to be killed */ + close(pipefd[0]); + res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); + close(pipefd[1]); + return res; +} + +int main(void) +{ + int pipefd[2], pidfd; + bool success, retry; + size_t size; + pid_t pid; + char byte; + int res; + + /* Test a wrong pidfd */ + if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { + res = (errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL);
+ perror("process_mrelease with wrong pidfd");
+ exit(res);
+ }
+
+ /* Start the test with 1MB child memory allocation */
+ size = 1;
+retry:
+ /*
+ * Pipe for the child to signal when it's done allocating
+ * memory
+ */
+ if (pipe(pipefd)) {
+ perror("pipe");
+ exit(KSFT_FAIL);
+ }
+ pid = fork();
+ if (pid < 0) {
+ perror("fork");
+ close(pipefd[0]);
+ close(pipefd[1]);
+ exit(KSFT_FAIL);
+ }
+
+ if (pid == 0) {
+ /* Child main routine */
+ res = child_main(pipefd, size);
+ exit(res);
+ }
+
+ /*
+ * Parent main routine:
+ * Wait for the child to finish allocations, then kill and reap
+ */
+ close(pipefd[1]);
+ /* Block until the child is ready */
+ res = read(pipefd[0], &byte, 1);
+ close(pipefd[0]);
+ if (res < 0) {
+ perror("read");
+ if (!kill(pid, SIGKILL))
+ waitpid(pid, NULL, 0);
+ exit(KSFT_FAIL);
+ }
+
+ pidfd = syscall(__NR_pidfd_open, pid, 0);
+ if (pidfd < 0) {
+ perror("pidfd_open");
+ if (!kill(pid, SIGKILL))
+ waitpid(pid, NULL, 0);
+ exit(KSFT_FAIL);
+ }
+
+ /* Run negative tests which require a live child */
+ run_negative_tests(pidfd);
+
+ if (kill(pid, SIGKILL)) {
+ res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+ perror("kill");
+ exit(res);
+ }
+
+ success = (syscall(__NR_process_mrelease, pidfd, 0) == 0);
+ if (!success) {
+ /*
+ * We may have failed to reap because the child exited too
+ * soon, before we could call process_mrelease. Double the
+ * child's memory, which makes it spend more time on cleanup
+ * and increases our chances of reaping its memory before it
+ * exits. Retry until we succeed or reach MAX_SIZE_MB.
+ */
+ if (errno == ESRCH) {
+ retry = (size <= MAX_SIZE_MB);
+ } else {
+ res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL);
+ perror("process_mrelease");
+ waitpid(pid, NULL, 0);
+ exit(res);
+ }
+ }
+
+ /* Cleanup to prevent zombies */
+ if (waitpid(pid, NULL, 0) < 0) {
+ perror("waitpid");
+ exit(KSFT_FAIL);
+ }
+ close(pidfd);
+
+ if (!success) {
+ if (retry) {
+ size *= 2;
+ goto retry;
+ }
+ printf("All process_mrelease attempts failed!\n");
+ exit(KSFT_FAIL);
+ }
+
+ printf("Success reaping a child with %zuMB of memory allocations\n",
+ size);
+ return KSFT_PASS;
+}
--- /dev/null
+++ a/tools/testing/selftests/mm/mremap_dontunmap.c
@@ -0,0 +1,364 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Tests for mremap w/ MREMAP_DONTUNMAP.
+ *
+ * Copyright 2020, Brian Geffon <bgeffon@xxxxxxxxxx>
+ */
+#define _GNU_SOURCE
+#include <sys/mman.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+
+#ifndef MREMAP_DONTUNMAP
+#define MREMAP_DONTUNMAP 4
+#endif
+
+unsigned long page_size;
+char *page_buffer;
+
+static void dump_maps(void)
+{
+ char cmd[32];
+
+ snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid());
+ system(cmd);
+}
+
+#define BUG_ON(condition, description) \
+ do { \
+ if (condition) { \
+ fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \
+ __LINE__, (description), strerror(errno)); \
+ dump_maps(); \
+ exit(1); \
+ } \
+ } while (0)
+
+// Try a simple operation to "test" for kernel support; this prevents
+// reporting tests as failed when run on an older kernel.
+static int kernel_support_for_mremap_dontunmap(void)
+{
+ int ret = 0;
+ unsigned long num_pages = 1;
+ void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+
+ // This simple remap should only fail if MREMAP_DONTUNMAP isn't
+ // supported.
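+ // On kernels that predate MREMAP_DONTUNMAP (added in v5.7), the
+ // remap below fails with EINVAL; returning that errno lets main()
+ // turn the whole run into a skip instead of a failure.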
+ void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0); + if (dest_mapping == MAP_FAILED) { + ret = errno; + } else { + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + } + + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return ret; +} + +// This helper will just validate that an entire mapping contains the expected +// byte. +static int check_region_contains_byte(void *addr, unsigned long size, char byte) +{ + BUG_ON(size & (page_size - 1), + "check_region_contains_byte expects page multiples"); + BUG_ON((unsigned long)addr & (page_size - 1), + "check_region_contains_byte expects page alignment"); + + memset(page_buffer, byte, page_size); + + unsigned long num_pages = size / page_size; + unsigned long i; + + // Compare each page checking that it contains our expected byte. + for (i = 0; i < num_pages; ++i) { + int ret = + memcmp(addr + (i * page_size), page_buffer, page_size); + if (ret) { + return ret; + } + } + + return 0; +} + +// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving +// the source mapping mapped. +static void mremap_dontunmap_simple() +{ + unsigned long num_pages = 5; + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. + BUG_ON(check_region_contains_byte + (dest_mapping, num_pages * page_size, 'a') != 0, + "pages did not migrate"); + BUG_ON(check_region_contains_byte + (source_mapping, num_pages * page_size, 0) != 0, + "source should have no ptes"); + + BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, + "unable to unmap destination mapping"); + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); +} + +// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. +static void mremap_dontunmap_simple_shmem() +{ + unsigned long num_pages = 5; + + int mem_fd = memfd_create("memfd", MFD_CLOEXEC); + BUG_ON(mem_fd < 0, "memfd_create"); + + BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, + "ftruncate"); + + void *source_mapping = + mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, + MAP_FILE | MAP_SHARED, mem_fd, 0); + BUG_ON(source_mapping == MAP_FAILED, "mmap"); + + BUG_ON(close(mem_fd) < 0, "close"); + + memset(source_mapping, 'a', num_pages * page_size); + + // Try to just move the whole mapping anywhere (not fixed). + void *dest_mapping = + mremap(source_mapping, num_pages * page_size, num_pages * page_size, + MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); + if (dest_mapping == MAP_FAILED && errno == EINVAL) { + // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. + BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, + "unable to unmap source mapping"); + return; + } + + BUG_ON(dest_mapping == MAP_FAILED, "mremap"); + + // Validate that the pages have been moved, we know they were moved if + // the dest_mapping contains a's. 
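+ // The destination check is the same as in the anonymous case; the
+ // shmem-specific behaviour only shows up at the source below.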
+ BUG_ON(check_region_contains_byte
+ (dest_mapping, num_pages * page_size, 'a') != 0,
+ "pages did not migrate");
+
+ // Because the region is backed by shmem, we will actually see the same
+ // memory at the source location still.
+ BUG_ON(check_region_contains_byte
+ (source_mapping, num_pages * page_size, 'a') != 0,
+ "source should still contain the 'a's");
+
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+// This test validates MREMAP_DONTUNMAP will move page tables to a specific
+// destination using MREMAP_FIXED, while also validating that the source
+// remains intact.
+static void mremap_dontunmap_simple_fixed()
+{
+ unsigned long num_pages = 5;
+
+ // Since we want to guarantee that we can remap to a point, we will
+ // create a mapping up front.
+ void *dest_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+ memset(dest_mapping, 'X', num_pages * page_size);
+
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ void *remapped_mapping =
+ mremap(source_mapping, num_pages * page_size, num_pages * page_size,
+ MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE,
+ dest_mapping);
+ BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+ BUG_ON(remapped_mapping != dest_mapping,
+ "mremap should have placed the remapped mapping at dest_mapping");
+
+ // The dest mapping will have been unmapped by mremap so we expect the
+ // Xs to be gone and replaced with a's.
+ BUG_ON(check_region_contains_byte
+ (dest_mapping, num_pages * page_size, 'a') != 0,
+ "pages did not migrate");
+
+ // And the source mapping will have had its ptes dropped.
+ BUG_ON(check_region_contains_byte
+ (source_mapping, num_pages * page_size, 0) != 0,
+ "source should have no ptes");
+
+ BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+// This test validates that we can MREMAP_DONTUNMAP for a portion of an
+// existing mapping.
+static void mremap_dontunmap_partial_mapping()
+{
+ /*
+ * source mapping:
+ * --------------
+ * | aaaaaaaaaa |
+ * --------------
+ * to become:
+ * --------------
+ * | aaaaa00000 |
+ * --------------
+ * with the destination mapping containing the 5 pages of a's:
+ * ---------
+ * | aaaaa |
+ * ---------
+ */
+ unsigned long num_pages = 10;
+ void *source_mapping =
+ mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', num_pages * page_size);
+
+ // We will grab the last 5 pages of the source and move them.
+ void *dest_mapping =
+ mremap(source_mapping + (5 * page_size), 5 * page_size,
+ 5 * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL);
+ BUG_ON(dest_mapping == MAP_FAILED, "mremap");
+
+ // We expect the first 5 pages of the source to contain a's and the
+ // final 5 pages to contain zeros.
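+ // Only the remapped half loses its PTEs: mremap() splits the source
+ // VMA at the 5-page boundary and leaves the first half untouched.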
+ BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') !=
+ 0, "first 5 pages of source should have original pages");
+ BUG_ON(check_region_contains_byte
+ (source_mapping + (5 * page_size), 5 * page_size, 0) != 0,
+ "final 5 pages of source should have no ptes");
+
+ // Finally we expect the destination to have 5 pages worth of a's.
+ BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') !=
+ 0, "dest mapping should contain ptes from the source");
+
+ BUG_ON(munmap(dest_mapping, 5 * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, num_pages * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+// This test validates that we can remap over only a portion of a mapping.
+static void mremap_dontunmap_partial_mapping_overwrite(void)
+{
+ /*
+ * source mapping:
+ * -------
+ * |aaaaa|
+ * -------
+ * dest mapping initially:
+ * ------------
+ * |XXXXXXXXXX|
+ * ------------
+ * Source to become:
+ * -------
+ * |00000|
+ * -------
+ * with the destination mapping containing the 5 pages of a's
+ * followed by the untouched X pages:
+ * ------------
+ * |aaaaaXXXXX|
+ * ------------
+ */
+ void *source_mapping =
+ mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(source_mapping == MAP_FAILED, "mmap");
+ memset(source_mapping, 'a', 5 * page_size);
+
+ void *dest_mapping =
+ mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ BUG_ON(dest_mapping == MAP_FAILED, "mmap");
+ memset(dest_mapping, 'X', 10 * page_size);
+
+ // Remap the entire source over the first 5 pages of the destination.
+ void *remapped_mapping =
+ mremap(source_mapping, 5 * page_size,
+ 5 * page_size,
+ MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping);
+ BUG_ON(remapped_mapping == MAP_FAILED, "mremap");
+ BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping");
+
+ BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) !=
+ 0, "first 5 pages of source should have no ptes");
+
+ // We expect the destination to have 5 pages worth of a's.
+ BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0,
+ "dest mapping should contain ptes from the source");
+
+ // And the last 5 pages of the destination shouldn't have been touched.
+ BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size),
+ 5 * page_size, 'X') != 0,
+ "dest mapping should have retained the last 5 pages");
+
+ BUG_ON(munmap(dest_mapping, 10 * page_size) == -1,
+ "unable to unmap destination mapping");
+ BUG_ON(munmap(source_mapping, 5 * page_size) == -1,
+ "unable to unmap source mapping");
+}
+
+int main(void)
+{
+ page_size = sysconf(_SC_PAGE_SIZE);
+
+ // Test for kernel support for MREMAP_DONTUNMAP, skipping the test if
+ // it is missing.
+ if (kernel_support_for_mremap_dontunmap() != 0) {
+ printf("No kernel support for MREMAP_DONTUNMAP\n");
+ return KSFT_SKIP;
+ }
+
+ // Keep a page-sized buffer around for when we need it.
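+ // Using mmap() rather than malloc() guarantees the scratch buffer is
+ // page-aligned and exactly one page long, matching the page-at-a-time
+ // memcmp() done in check_region_contains_byte().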
+ page_buffer = + mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); + + mremap_dontunmap_simple(); + mremap_dontunmap_simple_shmem(); + mremap_dontunmap_simple_fixed(); + mremap_dontunmap_partial_mapping(); + mremap_dontunmap_partial_mapping_overwrite(); + + BUG_ON(munmap(page_buffer, page_size) == -1, + "unable to unmap page buffer"); + + printf("OK\n"); + return 0; +} --- /dev/null +++ a/tools/testing/selftests/mm/mremap_test.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2020 Google LLC + */ +#define _GNU_SOURCE + +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sys/mman.h> +#include <time.h> +#include <stdbool.h> + +#include "../kselftest.h" + +#define EXPECT_SUCCESS 0 +#define EXPECT_FAILURE 1 +#define NON_OVERLAPPING 0 +#define OVERLAPPING 1 +#define NS_PER_SEC 1000000000ULL +#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ +#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ + +#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) + +struct config { + unsigned long long src_alignment; + unsigned long long dest_alignment; + unsigned long long region_size; + int overlapping; +}; + +struct test { + const char *name; + struct config config; + int expect_failure; +}; + +enum { + _1KB = 1ULL << 10, /* 1KB -> not page aligned */ + _4KB = 4ULL << 10, + _8KB = 8ULL << 10, + _1MB = 1ULL << 20, + _2MB = 2ULL << 20, + _4MB = 4ULL << 20, + _1GB = 1ULL << 30, + _2GB = 2ULL << 30, + PMD = _2MB, + PUD = _1GB, +}; + +#define PTE page_size + +#define MAKE_TEST(source_align, destination_align, size, \ + overlaps, should_fail, test_name) \ +(struct test){ \ + .name = test_name, \ + .config = { \ + .src_alignment = source_align, \ + .dest_alignment = destination_align, \ + .region_size = size, \ + .overlapping = overlaps, \ + }, \ + .expect_failure = should_fail \ +} + +/* + * Returns false if the requested remap region overlaps with an + * existing mapping (e.g text, stack) else returns true. + */ +static bool is_remap_region_valid(void *addr, unsigned long long size) +{ + void *remap_addr = NULL; + bool ret = true; + + /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ + remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + + if (remap_addr == MAP_FAILED) { + if (errno == EEXIST) + ret = false; + } else { + munmap(remap_addr, size); + } + + return ret; +} + +/* Returns mmap_min_addr sysctl tunable from procfs */ +static unsigned long long get_mmap_min_addr(void) +{ + FILE *fp; + int n_matched; + static unsigned long long addr; + + if (addr) + return addr; + + fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); + if (fp == NULL) { + ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + exit(KSFT_SKIP); + } + + n_matched = fscanf(fp, "%llu", &addr); + if (n_matched != 1) { + ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", + strerror(errno)); + fclose(fp); + exit(KSFT_SKIP); + } + + fclose(fp); + return addr; +} + +/* + * Using /proc/self/maps, assert that the specified address range is contained + * within a single mapping. 
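+ * A /proc/self/maps line starts with the mapped range, roughly:
+ *   559c6d9f1000-559c6d9f3000 rw-p 00000000 00:00 0
+ * so parsing the two hex values at the front of each line and checking
+ * whether a single range covers [start, end) is sufficient here.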
+ */ +static bool is_range_mapped(void *start, void *end) +{ + FILE *fp; + char *line = NULL; + size_t len = 0; + bool success = false; + + fp = fopen("/proc/self/maps", "r"); + if (fp == NULL) + return false; + + while (getline(&line, &len, fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); + void *second_val = (void *) strtol(second, NULL, 16); + + if (first_val <= start && second_val >= end) { + success = true; + break; + } + } + + fclose(fp); + return success; +} + +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + bool success = false; + int errsv = 0; + char *remap; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + errsv = errno; + goto error; + } + + munmap(start + page_size, page_size); + remap = mremap(start, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + errsv = errno; + munmap(start, page_size); + munmap(start + 2 * page_size, page_size); + goto error; + } + + success = is_range_mapped(start, start + 3 * page_size); + + munmap(start, 3 * page_size); + goto out; + +error: + ksft_print_msg("Unexpected mapping/remapping error: %s\n", + strerror(errsv)); +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +/* + * Similar to mremap_expand_merge() except instead of removing the middle page, + * we remove the last then attempt to remap offset from the second page. This + * should result in the mapping being restored to its former state. + */ +static void mremap_expand_merge_offset(unsigned long page_size) +{ + + char *test_name = "mremap expand merge offset"; + bool success = false; + int errsv = 0; + char *remap; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (start == MAP_FAILED) { + errsv = errno; + goto error; + } + + /* Unmap final page to ensure we have space to expand. */ + munmap(start + 2 * page_size, page_size); + remap = mremap(start + page_size, page_size, 2 * page_size, 0); + if (remap == MAP_FAILED) { + errsv = errno; + munmap(start, 2 * page_size); + goto error; + } + + success = is_range_mapped(start, start + 3 * page_size); + goto out; + +error: + ksft_print_msg("Unexpected mapping/remapping error: %s\n", + strerror(errsv)); +out: + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); +} + +/* + * Returns the start address of the mapping on success, else returns + * NULL on failure. 
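+ * The search walks multiples of c.src_alignment upwards from
+ * mmap_min_addr (e.g. 0x200000, 0x400000, ... for a 2MB alignment)
+ * until MAP_FIXED_NOREPLACE lands on a free slot that also passes the
+ * alignment filter below.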
+ */ +static void *get_source_mapping(struct config c) +{ + unsigned long long addr = 0ULL; + void *src_addr = NULL; + unsigned long long mmap_min_addr; + + mmap_min_addr = get_mmap_min_addr(); + +retry: + addr += c.src_alignment; + if (addr < mmap_min_addr) + goto retry; + + src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, + MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, + -1, 0); + if (src_addr == MAP_FAILED) { + if (errno == EPERM || errno == EEXIST) + goto retry; + goto error; + } + /* + * Check that the address is aligned to the specified alignment. + * Addresses which have alignments that are multiples of that + * specified are not considered valid. For instance, 1GB address is + * 2MB-aligned, however it will not be considered valid for a + * requested alignment of 2MB. This is done to reduce coincidental + * alignment in the tests. + */ + if (((unsigned long long) src_addr & (c.src_alignment - 1)) || + !((unsigned long long) src_addr & c.src_alignment)) { + munmap(src_addr, c.region_size); + goto retry; + } + + if (!src_addr) + goto error; + + return src_addr; +error: + ksft_print_msg("Failed to map source region: %s\n", + strerror(errno)); + return NULL; +} + +/* Returns the time taken for the remap on success else returns -1. */ +static long long remap_region(struct config c, unsigned int threshold_mb, + char pattern_seed) +{ + void *addr, *src_addr, *dest_addr; + unsigned long long i; + struct timespec t_start = {0, 0}, t_end = {0, 0}; + long long start_ns, end_ns, align_mask, ret, offset; + unsigned long long threshold; + + if (threshold_mb == VALIDATION_NO_THRESHOLD) + threshold = c.region_size; + else + threshold = MIN(threshold_mb * _1MB, c.region_size); + + src_addr = get_source_mapping(c); + if (!src_addr) { + ret = -1; + goto out; + } + + /* Set byte pattern */ + srand(pattern_seed); + for (i = 0; i < threshold; i++) + memset((char *) src_addr + i, (char) rand(), 1); + + /* Mask to zero out lower bits of address for alignment */ + align_mask = ~(c.dest_alignment - 1); + /* Offset of destination address from the end of the source region */ + offset = (c.overlapping) ? 
-c.dest_alignment : c.dest_alignment;
+ addr = (void *) (((unsigned long long) src_addr + c.region_size
+ + offset) & align_mask);
+
+ /* See comment in get_source_mapping() */
+ if (!((unsigned long long) addr & c.dest_alignment))
+ addr = (void *) ((unsigned long long) addr | c.dest_alignment);
+
+ /* Don't destroy existing mappings unless expected to overlap */
+ while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) {
+ /* Check for unsigned overflow */
+ if (addr + c.dest_alignment < addr) {
+ ksft_print_msg("Couldn't find a valid region to remap to\n");
+ ret = -1;
+ goto out;
+ }
+ addr += c.dest_alignment;
+ }
+
+ clock_gettime(CLOCK_MONOTONIC, &t_start);
+ dest_addr = mremap(src_addr, c.region_size, c.region_size,
+ MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr);
+ clock_gettime(CLOCK_MONOTONIC, &t_end);
+
+ if (dest_addr == MAP_FAILED) {
+ ksft_print_msg("mremap failed: %s\n", strerror(errno));
+ ret = -1;
+ goto clean_up_src;
+ }
+
+ /* Verify byte pattern after remapping */
+ srand(pattern_seed);
+ for (i = 0; i < threshold; i++) {
+ char expected = (char) rand();
+
+ if (((char *) dest_addr)[i] != expected) {
+ ksft_print_msg("Data after remap doesn't match at offset %llu\n",
+ i);
+ ksft_print_msg("Expected: %#x\t Got: %#x\n", expected & 0xff,
+ ((char *) dest_addr)[i] & 0xff);
+ ret = -1;
+ goto clean_up_dest;
+ }
+ }
+
+ start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec;
+ end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec;
+ ret = end_ns - start_ns;
+
+/*
+ * Since the destination address is specified using MREMAP_FIXED, subsequent
+ * mremap will unmap any previous mapping at the address range specified by
+ * dest_addr and region_size. This significantly affects the remap time of
+ * subsequent tests. So we clean up mappings after each test.
+ */
+clean_up_dest:
+ munmap(dest_addr, c.region_size);
+clean_up_src:
+ munmap(src_addr, c.region_size);
+out:
+ return ret;
+}
+
+static void run_mremap_test_case(struct test test_case, int *failures,
+ unsigned int threshold_mb,
+ unsigned int pattern_seed)
+{
+ long long remap_time = remap_region(test_case.config, threshold_mb,
+ pattern_seed);
+
+ if (remap_time < 0) {
+ if (test_case.expect_failure)
+ ksft_test_result_xfail("%s\n\tExpected mremap failure\n",
+ test_case.name);
+ else {
+ ksft_test_result_fail("%s\n", test_case.name);
+ *failures += 1;
+ }
+ } else {
+ /*
+ * Comparing mremap time is only applicable if entire region
+ * was faulted in.
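+ * (With a partial threshold only part of the region was written, so
+ * the page tables being moved are mostly empty and the measured time
+ * would not be comparable across configurations.)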
+ */ + if (threshold_mb == VALIDATION_NO_THRESHOLD || + test_case.config.region_size <= threshold_mb * _1MB) + ksft_test_result_pass("%s\n\tmremap time: %12lldns\n", + test_case.name, remap_time); + else + ksft_test_result_pass("%s\n", test_case.name); + } +} + +static void usage(const char *cmd) +{ + fprintf(stderr, + "Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n" + "-t\t only validate threshold_mb of the remapped region\n" + " \t if 0 is supplied no threshold is used; all tests\n" + " \t are run and remapped regions validated fully.\n" + " \t The default threshold used is 4MB.\n" + "-p\t provide a seed to generate the random pattern for\n" + " \t validating the remapped region.\n", cmd); +} + +static int parse_args(int argc, char **argv, unsigned int *threshold_mb, + unsigned int *pattern_seed) +{ + const char *optstr = "t:p:"; + int opt; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 't': + *threshold_mb = atoi(optarg); + break; + case 'p': + *pattern_seed = atoi(optarg); + break; + default: + usage(argv[0]); + return -1; + } + } + + if (optind < argc) { + usage(argv[0]); + return -1; + } + + return 0; +} + +#define MAX_TEST 13 +#define MAX_PERF_TEST 3 +int main(int argc, char **argv) +{ + int failures = 0; + int i, run_perf_tests; + unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; + unsigned int pattern_seed; + int num_expand_tests = 1; + struct test test_cases[MAX_TEST]; + struct test perf_test_cases[MAX_PERF_TEST]; + int page_size; + time_t t; + + pattern_seed = (unsigned int) time(&t); + + if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0) + exit(EXIT_FAILURE); + + ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", + threshold_mb, pattern_seed); + + page_size = sysconf(_SC_PAGESIZE); + + /* Expected mremap failures */ + test_cases[0] = MAKE_TEST(page_size, page_size, page_size, + OVERLAPPING, EXPECT_FAILURE, + "mremap - Source and Destination Regions Overlapping"); + + test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Destination Address Misaligned (1KB-aligned)"); + test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size, + NON_OVERLAPPING, EXPECT_FAILURE, + "mremap - Source Address Misaligned (1KB-aligned)"); + + /* Src addr PTE aligned */ + test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2, + NON_OVERLAPPING, EXPECT_SUCCESS, + "8KB mremap - Source PTE-aligned, Destination PTE-aligned"); + + /* Src addr 1MB aligned */ + test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"); + test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); + + /* Src addr PMD aligned */ + test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PTE-aligned"); + test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"); + test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, + "4MB mremap - Source PMD-aligned, Destination PMD-aligned"); + + /* Src addr PUD aligned */ + test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PTE-aligned"); + test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination 
1MB-aligned"); + test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PMD-aligned"); + test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); + /* + * mremap 1GB region - Page table level aligned time + * comparison. + */ + perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PMD-aligned, Destination PMD-aligned"); + perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, + "1GB mremap - Source PUD-aligned, Destination PUD-aligned"); + + run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || + (threshold_mb * _1MB >= _1GB); + + ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); + + for (i = 0; i < ARRAY_SIZE(test_cases); i++) + run_mremap_test_case(test_cases[i], &failures, threshold_mb, + pattern_seed); + + mremap_expand_merge(page_size); + mremap_expand_merge_offset(page_size); + + if (run_perf_tests) { + ksft_print_msg("\n%s\n", + "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); + for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) + run_mremap_test_case(perf_test_cases[i], &failures, + threshold_mb, pattern_seed); + } + + if (failures > 0) + ksft_exit_fail(); + else + ksft_exit_pass(); +} --- /dev/null +++ a/tools/testing/selftests/mm/on-fault-limit.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <sys/mman.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <sys/time.h> +#include <sys/resource.h> + +#ifndef MCL_ONFAULT +#define MCL_ONFAULT (MCL_FUTURE << 1) +#endif + +static int test_limit(void) +{ + int ret = 1; + struct rlimit lims; + void *map; + + if (getrlimit(RLIMIT_MEMLOCK, &lims)) { + perror("getrlimit"); + return ret; + } + + if (mlockall(MCL_ONFAULT | MCL_FUTURE)) { + perror("mlockall"); + return ret; + } + + map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); + if (map != MAP_FAILED) + printf("mmap should have failed, but didn't\n"); + else { + ret = 0; + munmap(map, 2 * lims.rlim_max); + } + + munlockall(); + return ret; +} + +int main(int argc, char **argv) +{ + int ret = 0; + + ret += test_limit(); + return ret; +} --- /dev/null +++ a/tools/testing/selftests/mm/pkey-helpers.h @@ -0,0 +1,226 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _PKEYS_HELPER_H +#define _PKEYS_HELPER_H +#define _GNU_SOURCE +#include <string.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <ucontext.h> +#include <sys/mman.h> + +#include "../kselftest.h" + +/* Define some kernel-like types */ +#define u8 __u8 +#define u16 __u16 +#define u32 __u32 +#define u64 __u64 + +#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 +extern int dprint_in_signal; +extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +extern int test_nr; +extern int iteration_nr; + +#ifdef __GNUC__ +__attribute__((format(printf, 1, 2))) +#endif +static inline void sigsafe_printf(const char *format, ...) 
+{ + va_list ap; + + if (!dprint_in_signal) { + va_start(ap, format); + vprintf(format, ap); + va_end(ap); + } else { + int ret; + /* + * No printf() functions are signal-safe. + * They deadlock easily. Write the format + * string to get some output, even if + * incomplete. + */ + ret = write(1, format, strlen(format)); + if (ret < 0) + exit(1); + } +} +#define dprintf_level(level, args...) do { \ + if (level <= DEBUG_LEVEL) \ + sigsafe_printf(args); \ +} while (0) +#define dprintf0(args...) dprintf_level(0, args) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) dprintf_level(4, args) + +extern void abort_hooks(void); +#define pkey_assert(condition) do { \ + if (!(condition)) { \ + dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ + __FILE__, __LINE__, \ + test_nr, iteration_nr); \ + dprintf0("errno at assert: %d", errno); \ + abort_hooks(); \ + exit(__LINE__); \ + } \ +} while (0) + +__attribute__((noinline)) int read_ptr(int *ptr); +void expected_pkey_fault(int pkey); +int sys_pkey_alloc(unsigned long flags, unsigned long init_val); +int sys_pkey_free(unsigned long pkey); +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey); +void record_pkey_malloc(void *ptr, long size, int prot); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#include "pkey-x86.h" +#elif defined(__powerpc64__) /* arch */ +#include "pkey-powerpc.h" +#else /* arch */ +#error Architecture not supported +#endif /* arch */ + +#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) + +static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) +{ + u32 shift = pkey_bit_position(pkey); + /* mask out bits from pkey in old value */ + reg &= ~((u64)PKEY_MASK << shift); + /* OR in new bits for pkey */ + reg |= (flags & PKEY_MASK) << shift; + return reg; +} + +static inline u64 get_pkey_bits(u64 reg, int pkey) +{ + u32 shift = pkey_bit_position(pkey); + /* + * shift down the relevant bits to the lowest two, then + * mask off all the other higher bits + */ + return ((reg >> shift) & PKEY_MASK); +} + +extern u64 shadow_pkey_reg; + +static inline u64 _read_pkey_reg(int line) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" + " shadow: %016llx\n", + line, pkey_reg, shadow_pkey_reg); + assert(pkey_reg == shadow_pkey_reg); + + return pkey_reg; +} + +#define read_pkey_reg() _read_pkey_reg(__LINE__) + +static inline void write_pkey_reg(u64 pkey_reg) +{ + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + /* will do the shadow check for us: */ + read_pkey_reg(); + __write_pkey_reg(pkey_reg); + shadow_pkey_reg = pkey_reg; + dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, + pkey_reg, __read_pkey_reg()); +} + +/* + * These are technically racy. since something could + * change PKEY register between the read and the write. 
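+ * In practice these selftests are single-threaded and the pkey
+ * register is per-thread state, so the only thing that can interleave
+ * with the read-modify-write below is this thread's own signal
+ * handler.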
+ */ +static inline void __pkey_access_allow(int pkey, int do_allow) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2; + + if (do_allow) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); + write_pkey_reg(pkey_reg); +} + +static inline void __pkey_write_allow(int pkey, int do_allow_write) +{ + u64 pkey_reg = read_pkey_reg(); + int bit = pkey * 2 + 1; + + if (do_allow_write) + pkey_reg &= (1<<bit); + else + pkey_reg |= (1<<bit); + + write_pkey_reg(pkey_reg); + dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); +} + +#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) +#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) +#define ALIGN_PTR_UP(p, ptr_align_to) \ + ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) +#define ALIGN_PTR_DOWN(p, ptr_align_to) \ + ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) +#define __stringify_1(x...) #x +#define __stringify(x...) __stringify_1(x) + +static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) +{ +#ifdef si_pkey + return &si->si_pkey; +#else + return (u32 *)(((u8 *)si) + si_pkey_offset); +#endif +} + +static inline int kernel_has_pkeys(void) +{ + /* try allocating a key and see if it succeeds */ + int ret = sys_pkey_alloc(0, 0); + if (ret <= 0) { + return 0; + } + sys_pkey_free(ret); + return 1; +} + +static inline int is_pkeys_supported(void) +{ + /* check if the cpu supports pkeys */ + if (!cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return 0; + } + + /* check if the kernel supports pkeys */ + if (!kernel_has_pkeys()) { + dprintf1("SKIP: %s: no kernel support\n", __func__); + return 0; + } + + return 1; +} + +#endif /* _PKEYS_HELPER_H */ --- /dev/null +++ a/tools/testing/selftests/mm/pkey-powerpc.h @@ -0,0 +1,133 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_POWERPC_H +#define _PKEYS_POWERPC_H + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 386 +#endif +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 384 +# define SYS_pkey_free 385 +#endif +#define REG_IP_IDX PT_NIP +#define REG_TRAPNO PT_TRAP +#define gregs gp_regs +#define fpregs fp_regs +#define si_pkey_offset 0x20 + +#undef PKEY_DISABLE_ACCESS +#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ + +#undef PKEY_DISABLE_WRITE +#define PKEY_DISABLE_WRITE 0x2 + +#define NR_PKEYS 32 +#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey + and 24 other keys that cannot be + represented in the PTE */ +#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, + pkey-1 and exec-only key */ +#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, + pkey-31 and exec-only key */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL << 24) +#define PAGE_SIZE sysconf(_SC_PAGESIZE) + +static inline u32 pkey_bit_position(int pkey) +{ + return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; +} + +static inline u64 __read_pkey_reg(void) +{ + u64 pkey_reg; + + asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); + + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + u64 amr = pkey_reg; + + dprintf4("%s() changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); + + asm volatile("isync; mtspr 0xd, %0; isync" + : : "r" ((unsigned long)(amr)) : "memory"); + + dprintf4("%s() pkey register after changing %016llx to %016llx\n", + __func__, __read_pkey_reg(), pkey_reg); +} + +static inline int cpu_has_pkeys(void) +{ + /* No simple way to determine this */ + return 1; +} + +static 
inline bool arch_is_powervm() +{ + struct stat buf; + + if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && + (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) + return true; + + return false; +} + +static inline int get_arch_reserved_keys(void) +{ + if (sysconf(_SC_PAGESIZE) == 4096) + return NR_RESERVED_PKEYS_4K; + else + if (arch_is_powervm()) + return NR_RESERVED_PKEYS_64K_4KEYS; + else + return NR_RESERVED_PKEYS_64K_3KEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + /* + * powerpc does not allow userspace to change permissions of exec-only + * keys since those keys are not allocated by userspace. The signal + * handler wont be able to reset the permissions, which means the code + * will infinitely continue to segfault here. + */ + return; +} + +/* 4-byte instructions * 16384 = 64K page */ +#define __page_o_noops() asm(".rept 16384 ; nop; .endr") + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + + ret = syscall(__NR_subpage_prot, ptr, size, NULL); + if (ret) { + perror("subpage_perm"); + return PTR_ERR_ENOTSUP; + } + + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +#endif /* _PKEYS_POWERPC_H */ --- /dev/null +++ a/tools/testing/selftests/mm/pkey-x86.h @@ -0,0 +1,177 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _PKEYS_X86_H +#define _PKEYS_X86_H + +#ifdef __i386__ + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 380 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 381 +# define SYS_pkey_free 382 +#endif + +#define REG_IP_IDX REG_EIP +#define si_pkey_offset 0x14 + +#else + +#ifndef SYS_mprotect_key +# define SYS_mprotect_key 329 +#endif + +#ifndef SYS_pkey_alloc +# define SYS_pkey_alloc 330 +# define SYS_pkey_free 331 +#endif + +#define REG_IP_IDX REG_RIP +#define si_pkey_offset 0x20 + +#endif + +#ifndef PKEY_DISABLE_ACCESS +# define PKEY_DISABLE_ACCESS 0x1 +#endif + +#ifndef PKEY_DISABLE_WRITE +# define PKEY_DISABLE_WRITE 0x2 +#endif + +#define NR_PKEYS 16 +#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ +#define PKEY_BITS_PER_PKEY 2 +#define HPAGE_SIZE (1UL<<21) +#define PAGE_SIZE 4096 +#define MB (1<<20) + +static inline void __page_o_noops(void) +{ + /* 8-bytes of instruction * 512 bytes = 1 page */ + asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); +} + +static inline u64 __read_pkey_reg(void) +{ + unsigned int eax, edx; + unsigned int ecx = 0; + unsigned pkey_reg; + + asm volatile(".byte 0x0f,0x01,0xee\n\t" + : "=a" (eax), "=d" (edx) + : "c" (ecx)); + pkey_reg = eax; + return pkey_reg; +} + +static inline void __write_pkey_reg(u64 pkey_reg) +{ + unsigned int eax = pkey_reg; + unsigned int ecx = 0; + unsigned int edx = 0; + + dprintf4("%s() changing %016llx to %016llx\n", __func__, + __read_pkey_reg(), pkey_reg); + asm volatile(".byte 0x0f,0x01,0xef\n\t" + : : "a" (eax), "c" (ecx), "d" (edx)); + assert(pkey_reg == __read_pkey_reg()); +} + +/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ +#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ +#define 
X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ + +static inline int cpu_has_pkeys(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx); + + if (!(ecx & X86_FEATURE_PKU)) { + dprintf2("cpu does not have PKU\n"); + return 0; + } + if (!(ecx & X86_FEATURE_OSPKE)) { + dprintf2("cpu does not have OSPKE\n"); + return 0; + } + return 1; +} + +static inline int cpu_max_xsave_size(void) +{ + unsigned long XSTATE_CPUID = 0xd; + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + + __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx); + return ecx; +} + +static inline u32 pkey_bit_position(int pkey) +{ + return pkey * PKEY_BITS_PER_PKEY; +} + +#define XSTATE_PKEY_BIT (9) +#define XSTATE_PKEY 0x200 +#define XSTATE_BV_OFFSET 512 + +int pkey_reg_xstate_offset(void) +{ + unsigned int eax; + unsigned int ebx; + unsigned int ecx; + unsigned int edx; + int xstate_offset; + int xstate_size; + unsigned long XSTATE_CPUID = 0xd; + int leaf; + + /* assume that XSTATE_PKEY is set in XCR0 */ + leaf = XSTATE_PKEY_BIT; + { + __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx); + + if (leaf == XSTATE_PKEY_BIT) { + xstate_offset = ebx; + xstate_size = eax; + } + } + + if (xstate_size == 0) { + printf("could not find size/offset of PKEY in xsave state\n"); + return 0; + } + + return xstate_offset; +} + +static inline int get_arch_reserved_keys(void) +{ + return NR_RESERVED_PKEYS; +} + +void expect_fault_on_read_execonly_key(void *p1, int pkey) +{ + int ptr_contents; + + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + expected_pkey_fault(pkey); +} + +void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) +{ + return PTR_ERR_ENOTSUP; +} + +#endif /* _PKEYS_X86_H */ --- /dev/null +++ a/tools/testing/selftests/mm/protection_keys.c @@ -0,0 +1,1788 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) + * + * There are examples in here of: + * * how to set protection keys on memory + * * how to set/clear bits in pkey registers (the rights register) + * * how to handle SEGV_PKUERR signals and extract pkey-relevant + * information from the siginfo + * + * Things to add: + * make sure KSM and KSM COW breaking works + * prefault pages in at malloc, or not + * protect MPX bounds tables with protection keys? 
+ * make sure VMA splitting/merging is working correctly + * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys + * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel + * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks + * + * Compile like this: + * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm + */ +#define _GNU_SOURCE +#define __SANE_USERSPACE_TYPES__ +#include <errno.h> +#include <linux/elf.h> +#include <linux/futex.h> +#include <time.h> +#include <sys/time.h> +#include <sys/syscall.h> +#include <string.h> +#include <stdio.h> +#include <stdint.h> +#include <stdbool.h> +#include <signal.h> +#include <assert.h> +#include <stdlib.h> +#include <ucontext.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <sys/ptrace.h> +#include <setjmp.h> + +#include "pkey-helpers.h" + +int iteration_nr = 1; +int test_nr; + +u64 shadow_pkey_reg; +int dprint_in_signal; +char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; + +void cat_into_file(char *str, char *file) +{ + int fd = open(file, O_RDWR); + int ret; + + dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); + /* + * these need to be raw because they are called under + * pkey_assert() + */ + if (fd < 0) { + fprintf(stderr, "error opening '%s'\n", str); + perror("error: "); + exit(__LINE__); + } + + ret = write(fd, str, strlen(str)); + if (ret != strlen(str)) { + perror("write to file failed"); + fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); + exit(__LINE__); + } + close(fd); +} + +#if CONTROL_TRACING > 0 +static int warned_tracing; +int tracing_root_ok(void) +{ + if (geteuid() != 0) { + if (!warned_tracing) + fprintf(stderr, "WARNING: not run as root, " + "can not do tracing control\n"); + warned_tracing = 1; + return 0; + } + return 1; +} +#endif + +void tracing_on(void) +{ +#if CONTROL_TRACING > 0 +#define TRACEDIR "/sys/kernel/debug/tracing" + char pidstr[32]; + + if (!tracing_root_ok()) + return; + + sprintf(pidstr, "%d", getpid()); + cat_into_file("0", TRACEDIR "/tracing_on"); + cat_into_file("\n", TRACEDIR "/trace"); + if (1) { + cat_into_file("function_graph", TRACEDIR "/current_tracer"); + cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); + } else { + cat_into_file("nop", TRACEDIR "/current_tracer"); + } + cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); + cat_into_file("1", TRACEDIR "/tracing_on"); + dprintf1("enabled tracing\n"); +#endif +} + +void tracing_off(void) +{ +#if CONTROL_TRACING > 0 + if (!tracing_root_ok()) + return; + cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); +#endif +} + +void abort_hooks(void) +{ + fprintf(stderr, "running %s()...\n", __func__); + tracing_off(); +#ifdef SLEEP_ON_ABORT + sleep(SLEEP_ON_ABORT); +#endif +} + +/* + * This attempts to have roughly a page of instructions followed by a few + * instructions that do a write, and another page of instructions. That + * way, we are pretty sure that the write is in the second page of + * instructions and has at least a page of padding behind it. + * + * *That* lets us be sure to madvise() away the write instruction, which + * will then fault, which makes sure that the fault code handles + * execute-only memory properly. 
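+ * The padding arithmetic is in each arch's __page_o_noops(): x86 uses
+ * 512 eight-byte nopl instructions (4096 bytes, one 4K page), while
+ * powerpc uses 16384 four-byte nops to fill a 64K page.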
+ */ +#ifdef __powerpc64__ +/* This way, both 4K and 64K alignment are maintained */ +__attribute__((__aligned__(65536))) +#else +__attribute__((__aligned__(PAGE_SIZE))) +#endif +void lots_o_noops_around_write(int *write_to_me) +{ + dprintf3("running %s()\n", __func__); + __page_o_noops(); + /* Assume this happens in the second page of instructions: */ + *write_to_me = __LINE__; + /* pad out by another page: */ + __page_o_noops(); + dprintf3("%s() done\n", __func__); +} + +void dump_mem(void *dumpme, int len_bytes) +{ + char *c = (void *)dumpme; + int i; + + for (i = 0; i < len_bytes; i += sizeof(u64)) { + u64 *ptr = (u64 *)(c + i); + dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); + } +} + +static u32 hw_pkey_get(int pkey, unsigned long flags) +{ + u64 pkey_reg = __read_pkey_reg(); + + dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", + __func__, pkey, flags, 0, 0); + dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); + + return (u32) get_pkey_bits(pkey_reg, pkey); +} + +static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) +{ + u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); + u64 old_pkey_reg = __read_pkey_reg(); + u64 new_pkey_reg; + + /* make sure that 'rights' only contains the bits we expect: */ + assert(!(rights & ~mask)); + + /* modify bits accordingly in old pkey_reg and assign it */ + new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); + + __write_pkey_reg(new_pkey_reg); + + dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" + " pkey_reg now: %016llx old_pkey_reg: %016llx\n", + __func__, pkey, rights, flags, 0, __read_pkey_reg(), + old_pkey_reg); + return 0; +} + +void pkey_disable_set(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights; + u64 orig_pkey_reg = read_pkey_reg(); + + dprintf1("START->%s(%d, 0x%x)\n", __func__, + pkey, flags); + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + pkey_assert(pkey_rights >= 0); + + pkey_rights |= flags; + + ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); + assert(!ret); + /* pkey_reg and flags have the same format */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + dprintf1("%s(%d) shadow: 0x%016llx\n", + __func__, pkey, shadow_pkey_reg); + + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", + __func__, pkey, read_pkey_reg()); + if (flags) + pkey_assert(read_pkey_reg() >= orig_pkey_reg); + dprintf1("END<---%s(%d, 0x%x)\n", __func__, + pkey, flags); +} + +void pkey_disable_clear(int pkey, int flags) +{ + unsigned long syscall_flags = 0; + int ret; + int pkey_rights = hw_pkey_get(pkey, syscall_flags); + u64 orig_pkey_reg = read_pkey_reg(); + + pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); + + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + pkey_assert(pkey_rights >= 0); + + pkey_rights &= ~flags; + + ret = hw_pkey_set(pkey, pkey_rights, 0); + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); + pkey_assert(ret >= 0); + + pkey_rights = hw_pkey_get(pkey, syscall_flags); + dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, + pkey, pkey, pkey_rights); + + dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, + pkey, read_pkey_reg()); + if (flags) + 
assert(read_pkey_reg() <= orig_pkey_reg); +} + +void pkey_write_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); +} +void pkey_write_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_WRITE); +} +void pkey_access_allow(int pkey) +{ + pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); +} +void pkey_access_deny(int pkey) +{ + pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); +} + +/* Failed address bound checks: */ +#ifndef SEGV_BNDERR +# define SEGV_BNDERR 3 +#endif + +#ifndef SEGV_PKUERR +# define SEGV_PKUERR 4 +#endif + +static char *si_code_str(int si_code) +{ + if (si_code == SEGV_MAPERR) + return "SEGV_MAPERR"; + if (si_code == SEGV_ACCERR) + return "SEGV_ACCERR"; + if (si_code == SEGV_BNDERR) + return "SEGV_BNDERR"; + if (si_code == SEGV_PKUERR) + return "SEGV_PKUERR"; + return "UNKNOWN"; +} + +int pkey_faults; +int last_si_pkey = -1; +void signal_handler(int signum, siginfo_t *si, void *vucontext) +{ + ucontext_t *uctxt = vucontext; + int trapno; + unsigned long ip; + char *fpregs; +#if defined(__i386__) || defined(__x86_64__) /* arch */ + u32 *pkey_reg_ptr; + int pkey_reg_offset; +#endif /* arch */ + u64 siginfo_pkey; + u32 *si_pkey_ptr; + + dprint_in_signal = 1; + dprintf1(">>>>===============SIGSEGV============================\n"); + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, + __read_pkey_reg(), shadow_pkey_reg); + + trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; + ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + fpregs = (char *) uctxt->uc_mcontext.fpregs; + + dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", + __func__, trapno, ip, si_code_str(si->si_code), + si->si_code); + +#if defined(__i386__) || defined(__x86_64__) /* arch */ +#ifdef __i386__ + /* + * 32-bit has some extra padding so that userspace can tell whether + * the XSTATE header is present in addition to the "legacy" FPU + * state. We just assume that it is here. + */ + fpregs += 0x70; +#endif /* i386 */ + pkey_reg_offset = pkey_reg_xstate_offset(); + pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); + + /* + * If we got a PKEY fault, we *HAVE* to have at least one bit set in + * here. 
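+ * SEGV_PKUERR is only raised when the faulting access was denied by
+ * some pkey, i.e. at least one access- or write-disable bit was set
+ * in the pkey register, so the copy saved in the signal frame's XSAVE
+ * area cannot be all zeroes.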
+ */ + dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); + if (DEBUG_LEVEL > 4) + dump_mem(pkey_reg_ptr - 128, 256); + pkey_assert(*pkey_reg_ptr); +#endif /* arch */ + + dprintf1("siginfo: %p\n", si); + dprintf1(" fpregs: %p\n", fpregs); + + if ((si->si_code == SEGV_MAPERR) || + (si->si_code == SEGV_ACCERR) || + (si->si_code == SEGV_BNDERR)) { + printf("non-PK si_code, exiting...\n"); + exit(4); + } + + si_pkey_ptr = siginfo_get_pkey_ptr(si); + dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); + dump_mem((u8 *)si_pkey_ptr - 8, 24); + siginfo_pkey = *si_pkey_ptr; + pkey_assert(siginfo_pkey < NR_PKEYS); + last_si_pkey = siginfo_pkey; + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + dprintf1("signal pkey_reg from pkey_reg: %016llx\n", + __read_pkey_reg()); + dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); +#if defined(__i386__) || defined(__x86_64__) /* arch */ + dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); + *(u64 *)pkey_reg_ptr = 0x00000000; + dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); +#elif defined(__powerpc64__) /* arch */ + /* restore access and let the faulting instruction continue */ + pkey_access_allow(siginfo_pkey); +#endif /* arch */ + pkey_faults++; + dprintf1("<<<<==================================================\n"); + dprint_in_signal = 0; +} + +int wait_all_children(void) +{ + int status; + return waitpid(-1, &status, 0); +} + +void sig_chld(int x) +{ + dprint_in_signal = 1; + dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); + dprint_in_signal = 0; +} + +void setup_sigsegv_handler(void) +{ + int r, rs; + struct sigaction newact; + struct sigaction oldact; + + /* #PF is mapped to sigsegv */ + int signum = SIGSEGV; + + newact.sa_handler = 0; + newact.sa_sigaction = signal_handler; + + /*sigset_t - signals to block while in the handler */ + /* get the old signal mask. 
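+ * With a NULL 'set' argument, sigprocmask() leaves the mask unchanged
+ * and only copies the current mask into the third argument -- here
+ * straight into newact.sa_mask, so the handler runs with the signals
+ * that were already blocked.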
*/ + rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); + pkey_assert(rs == 0); + + /* call sa_sigaction, not sa_handler*/ + newact.sa_flags = SA_SIGINFO; + + newact.sa_restorer = 0; /* void(*)(), obsolete */ + r = sigaction(signum, &newact, &oldact); + r = sigaction(SIGALRM, &newact, &oldact); + pkey_assert(r == 0); +} + +void setup_handlers(void) +{ + signal(SIGCHLD, &sig_chld); + setup_sigsegv_handler(); +} + +pid_t fork_lazy_child(void) +{ + pid_t forkret; + + forkret = fork(); + pkey_assert(forkret >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); + + if (!forkret) { + /* in the child */ + while (1) { + dprintf1("child sleeping...\n"); + sleep(30); + } + } + return forkret; +} + +int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int sret; + + dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, + ptr, size, orig_prot, pkey); + + errno = 0; + sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); + if (errno) { + dprintf2("SYS_mprotect_key sret: %d\n", sret); + dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); + dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); + if (DEBUG_LEVEL >= 2) + perror("SYS_mprotect_pkey"); + } + return sret; +} + +int sys_pkey_alloc(unsigned long flags, unsigned long init_val) +{ + int ret = syscall(SYS_pkey_alloc, flags, init_val); + dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", + __func__, flags, init_val, ret, errno); + return ret; +} + +int alloc_pkey(void) +{ + int ret; + unsigned long init_val = 0x0; + + dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", + __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); + ret = sys_pkey_alloc(0, init_val); + /* + * pkey_alloc() sets PKEY register, so we need to reflect it in + * shadow_pkey_reg: + */ + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + if (ret > 0) { + /* clear both the bits: */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + ~PKEY_MASK); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, + __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + /* + * move the new state in from init_val + * (remember, we cheated and init_val == pkey_reg format) + */ + shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, + init_val); + } + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); + /* for shadow checking: */ + read_pkey_reg(); + dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + return ret; +} + +int sys_pkey_free(unsigned long pkey) +{ + int ret = syscall(SYS_pkey_free, pkey); + dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); + return ret; +} + +/* + * I had a bug where pkey bits could be set by mprotect() but + * not cleared. This ensures we get lots of random bit sets + * and clears on the vma and pte pkey bits. 
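+ * Allocating every available key below and then freeing all but one
+ * randomly chosen key also exercises the kernel's pkey allocation and
+ * free paths on every iteration.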
+ */ +int alloc_random_pkey(void) +{ + int max_nr_pkey_allocs; + int ret; + int i; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + int random_index; + memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); + + /* allocate every possible key and make a note of which ones we got */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + + pkey_assert(nr_alloced > 0); + /* select a random one out of the allocated ones */ + random_index = rand() % nr_alloced; + ret = alloced_pkeys[random_index]; + /* now zero it out so we don't free it next */ + alloced_pkeys[random_index] = 0; + + /* go through the allocated ones that we did not want and free them */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); + return ret; +} + +int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, + unsigned long pkey) +{ + int nr_iterations = random() % 100; + int ret; + + while (0) { + int rpkey = alloc_random_pkey(); + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + if (nr_iterations-- < 0) + break; + + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + sys_pkey_free(rpkey); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", + __func__, __LINE__, ret, __read_pkey_reg(), + shadow_pkey_reg); + } + pkey_assert(pkey < NR_PKEYS); + + ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); + dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", + ptr, size, orig_prot, pkey, ret); + pkey_assert(!ret); + dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" + " shadow: 0x%016llx\n", __func__, + __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); + return ret; +} + +struct pkey_malloc_record { + void *ptr; + long size; + int prot; +}; +struct pkey_malloc_record *pkey_malloc_records; +struct pkey_malloc_record *pkey_last_malloc_record; +long nr_pkey_malloc_records; +void record_pkey_malloc(void *ptr, long size, int prot) +{ + long i; + struct pkey_malloc_record *rec = NULL; + + for (i = 0; i < nr_pkey_malloc_records; i++) { + rec = &pkey_malloc_records[i]; + /* find a free record */ + if (rec) + break; + } + if (!rec) { + /* every record is full */ + size_t old_nr_records = nr_pkey_malloc_records; + size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); + size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); + dprintf2("new_nr_records: %zd\n", new_nr_records); + dprintf2("new_size: %zd\n", new_size); + pkey_malloc_records = realloc(pkey_malloc_records, new_size); + pkey_assert(pkey_malloc_records != NULL); + rec = &pkey_malloc_records[nr_pkey_malloc_records]; + /* + * realloc() does not initialize memory, so zero it from + * the first new record all the way to the end. 
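+		 * (The table grows geometrically: new_nr_records = 2 * n + 1,
+		 * so the very first call allocates a single record.)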
+ */ + for (i = 0; i < new_nr_records - old_nr_records; i++) + memset(rec + i, 0, sizeof(*rec)); + } + dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", + (int)(rec - pkey_malloc_records), rec, ptr, size); + rec->ptr = ptr; + rec->size = size; + rec->prot = prot; + pkey_last_malloc_record = rec; + nr_pkey_malloc_records++; +} + +void free_pkey_malloc(void *ptr) +{ + long i; + int ret; + dprintf3("%s(%p)\n", __func__, ptr); + for (i = 0; i < nr_pkey_malloc_records; i++) { + struct pkey_malloc_record *rec = &pkey_malloc_records[i]; + dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + if ((ptr < rec->ptr) || + (ptr >= rec->ptr + rec->size)) + continue; + + dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", + ptr, i, rec, rec->ptr, rec->size); + nr_pkey_malloc_records--; + ret = munmap(rec->ptr, rec->size); + dprintf3("munmap ret: %d\n", ret); + pkey_assert(!ret); + dprintf3("clearing rec->ptr, rec: %p\n", rec); + rec->ptr = NULL; + dprintf3("done clearing rec->ptr, rec: %p\n", rec); + return; + } + pkey_assert(false); +} + + +void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) +{ + void *ptr; + int ret; + + read_pkey_reg(); + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + pkey_assert(pkey < NR_PKEYS); + ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); + pkey_assert(!ret); + record_pkey_malloc(ptr, size, prot); + read_pkey_reg(); + + dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); + return ptr; +} + +void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) +{ + int ret; + void *ptr; + + dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, + size, prot, pkey); + /* + * Guarantee we can fit at least one huge page in the resulting + * allocation by allocating space for 2: + */ + size = ALIGN_UP(size, HPAGE_SIZE * 2); + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + pkey_assert(ptr != (void *)-1); + record_pkey_malloc(ptr, size, prot); + mprotect_pkey(ptr, size, prot, pkey); + + dprintf1("unaligned ptr: %p\n", ptr); + ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); + dprintf1(" aligned ptr: %p\n", ptr); + ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); + dprintf1("MADV_HUGEPAGE ret: %d\n", ret); + ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); + dprintf1("MADV_WILLNEED ret: %d\n", ret); + memset(ptr, 0, HPAGE_SIZE); + + dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); + return ptr; +} + +int hugetlb_setup_ok; +#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" +#define GET_NR_HUGE_PAGES 10 +void setup_hugetlbfs(void) +{ + int err; + int fd; + char buf[256]; + long hpagesz_kb; + long hpagesz_mb; + + if (geteuid() != 0) { + fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); + return; + } + + cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); + + /* + * Now go make sure that we got the pages and that they + * are PMD-level pages. Someone might have made PUD-level + * pages the default. 
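+	 * (On x86-64, for example, PMD-level pages are 2MB and PUD-level
+	 * pages are 1GB; the sysfs file checked below is derived from
+	 * HPAGE_SIZE, so only the PMD-sized pool is verified.)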
+ */
+	hpagesz_kb = HPAGE_SIZE / 1024;
+	hpagesz_mb = hpagesz_kb / 1024;
+	sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb);
+	fd = open(buf, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n",
+			hpagesz_mb, strerror(errno));
+		return;
+	}
+
+	/* -1 to guarantee leaving the trailing \0 */
+	err = read(fd, buf, sizeof(buf)-1);
+	close(fd);
+	if (err <= 0) {
+		fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n",
+			hpagesz_mb, strerror(errno));
+		return;
+	}
+
+	if (atoi(buf) != GET_NR_HUGE_PAGES) {
+		fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n",
+			hpagesz_mb, buf, GET_NR_HUGE_PAGES);
+		return;
+	}
+
+	hugetlb_setup_ok = 1;
+}
+
+void *malloc_pkey_hugetlb(long size, int prot, u16 pkey)
+{
+	void *ptr;
+	int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB;
+
+	if (!hugetlb_setup_ok)
+		return PTR_ERR_ENOTSUP;
+
+	dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey);
+	size = ALIGN_UP(size, HPAGE_SIZE * 2);
+	pkey_assert(pkey < NR_PKEYS);
+	ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0);
+	pkey_assert(ptr != (void *)-1);
+	mprotect_pkey(ptr, size, prot, pkey);
+
+	record_pkey_malloc(ptr, size, prot);
+
+	dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr);
+	return ptr;
+}
+
+void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey)
+{
+	void *ptr;
+	int fd;
+
+	dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__,
+			size, prot, pkey);
+	pkey_assert(pkey < NR_PKEYS);
+	fd = open("/dax/foo", O_RDWR);
+	pkey_assert(fd >= 0);
+
+	ptr = mmap(0, size, prot, MAP_SHARED, fd, 0);
+	pkey_assert(ptr != (void *)-1);
+
+	mprotect_pkey(ptr, size, prot, pkey);
+
+	record_pkey_malloc(ptr, size, prot);
+
+	dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr);
+	close(fd);
+	return ptr;
+}
+
+void *(*pkey_malloc[])(long size, int prot, u16 pkey) = {
+
+	malloc_pkey_with_mprotect,
+	malloc_pkey_with_mprotect_subpage,
+	malloc_pkey_anon_huge,
+	malloc_pkey_hugetlb
+/* can not do direct with the pkey_mprotect() API:
+	malloc_pkey_mmap_direct,
+	malloc_pkey_mmap_dax,
+*/
+};
+
+void *malloc_pkey(long size, int prot, u16 pkey)
+{
+	void *ret;
+	static int malloc_type;
+	int nr_malloc_types = ARRAY_SIZE(pkey_malloc);
+
+	pkey_assert(pkey < NR_PKEYS);
+
+	while (1) {
+		pkey_assert(malloc_type < nr_malloc_types);
+
+		ret = pkey_malloc[malloc_type](size, prot, pkey);
+		pkey_assert(ret != (void *)-1);
+
+		malloc_type++;
+		if (malloc_type >= nr_malloc_types)
+			malloc_type = (random()%nr_malloc_types);
+
+		/* try again if the malloc_type we tried is unsupported */
+		if (ret == PTR_ERR_ENOTSUP)
+			continue;
+
+		break;
+	}
+
+	dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__,
+			size, prot, pkey, ret);
+	return ret;
+}
+
+int last_pkey_faults;
+#define UNKNOWN_PKEY -2
+void expected_pkey_fault(int pkey)
+{
+	dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n",
+			__func__, last_pkey_faults, pkey_faults);
+	dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey);
+	pkey_assert(last_pkey_faults + 1 == pkey_faults);
+
+	/*
+	 * For exec-only memory, we do not know the pkey in
+	 * advance, so skip this check.
+	 */
+	if (pkey != UNKNOWN_PKEY)
+		pkey_assert(last_si_pkey == pkey);
+
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+	/*
+	 * The signal handler should have cleared out the PKEY register to
+	 * let the test program continue.  We now have to restore it.
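+	 * (shadow_pkey_reg is the test's software model of the register;
+	 * restoring from it keeps the later shadow-consistency checks
+	 * meaningful.)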
+ */ + if (__read_pkey_reg() != 0) +#else /* arch */ + if (__read_pkey_reg() != shadow_pkey_reg) +#endif /* arch */ + pkey_assert(0); + + __write_pkey_reg(shadow_pkey_reg); + dprintf1("%s() set pkey_reg=%016llx to restore state after signal " + "nuked it\n", __func__, shadow_pkey_reg); + last_pkey_faults = pkey_faults; + last_si_pkey = -1; +} + +#define do_not_expect_pkey_fault(msg) do { \ + if (last_pkey_faults != pkey_faults) \ + dprintf0("unexpected PKey fault: %s\n", msg); \ + pkey_assert(last_pkey_faults == pkey_faults); \ +} while (0) + +int test_fds[10] = { -1 }; +int nr_test_fds; +void __save_test_fd(int fd) +{ + pkey_assert(fd >= 0); + pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); + test_fds[nr_test_fds] = fd; + nr_test_fds++; +} + +int get_test_read_fd(void) +{ + int test_fd = open("/etc/passwd", O_RDONLY); + __save_test_fd(test_fd); + return test_fd; +} + +void close_test_fds(void) +{ + int i; + + for (i = 0; i < nr_test_fds; i++) { + if (test_fds[i] < 0) + continue; + close(test_fds[i]); + test_fds[i] = -1; + } + nr_test_fds = 0; +} + +#define barrier() __asm__ __volatile__("": : :"memory") +__attribute__((noinline)) int read_ptr(int *ptr) +{ + /* + * Keep GCC from optimizing this away somehow + */ + barrier(); + return *ptr; +} + +void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) +{ + int i, err; + int max_nr_pkey_allocs; + int alloced_pkeys[NR_PKEYS]; + int nr_alloced = 0; + long size; + + pkey_assert(pkey_last_malloc_record); + size = pkey_last_malloc_record->size; + /* + * This is a bit of a hack. But mprotect() requires + * huge-page-aligned sizes when operating on hugetlbfs. + * So, make sure that we use something that's a multiple + * of a huge page when we can. + */ + if (size >= HPAGE_SIZE) + size = HPAGE_SIZE; + + /* allocate every possible key and make sure key-0 never got allocated */ + max_nr_pkey_allocs = NR_PKEYS; + for (i = 0; i < max_nr_pkey_allocs; i++) { + int new_pkey = alloc_pkey(); + pkey_assert(new_pkey != 0); + + if (new_pkey < 0) + break; + alloced_pkeys[nr_alloced++] = new_pkey; + } + /* free all the allocated keys */ + for (i = 0; i < nr_alloced; i++) { + int free_ret; + + if (!alloced_pkeys[i]) + continue; + free_ret = sys_pkey_free(alloced_pkeys[i]); + pkey_assert(!free_ret); + } + + /* attach key-0 in various modes */ + err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); + pkey_assert(!err); + err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); + pkey_assert(!err); +} + +void test_read_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling write access to PKEY[1], doing read\n"); + pkey_write_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + dprintf1("\n"); +} +void test_read_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + int ptr_contents; + + dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", + pkey, ptr); + ptr_contents = read_ptr(ptr); + dprintf1("reading ptr before 
disabling the read : %d\n", + ptr_contents); + read_pkey_reg(); + pkey_access_deny(pkey); + ptr_contents = read_ptr(ptr); + dprintf1("*ptr: %d\n", ptr_contents); + expected_pkey_fault(pkey); +} + +void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling write access; after accessing the page, " + "to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); + pkey_write_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} +void test_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, + u16 pkey) +{ + *ptr = __LINE__; + dprintf1("disabling access; after accessing the page, " + " to PKEY[%02d], doing write\n", pkey); + pkey_access_deny(pkey); + *ptr = __LINE__; + expected_pkey_fault(pkey); +} + +void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + dprintf1("disabling access to PKEY[%02d], " + "having kernel read() to buffer\n", pkey); + pkey_access_deny(pkey); + ret = read(test_fd, ptr, 1); + dprintf1("read ret: %d\n", ret); + pkey_assert(ret); +} +void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) +{ + int ret; + int test_fd = get_test_read_fd(); + + pkey_write_deny(pkey); + ret = read(test_fd, ptr, 100); + dprintf1("read ret: %d\n", ret); + if (ret < 0 && (DEBUG_LEVEL > 0)) + perror("verbose read result (OK for this to be bad)"); + pkey_assert(ret); +} + +void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) +{ + int pipe_ret, vmsplice_ret; + struct iovec iov; + int pipe_fds[2]; + + pipe_ret = pipe(pipe_fds); + + pkey_assert(pipe_ret == 0); + dprintf1("disabling access to PKEY[%02d], " + "having kernel vmsplice from buffer\n", pkey); + pkey_access_deny(pkey); + iov.iov_base = ptr; + iov.iov_len = PAGE_SIZE; + vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); + dprintf1("vmsplice() ret: %d\n", vmsplice_ret); + pkey_assert(vmsplice_ret == -1); + + close(pipe_fds[0]); + close(pipe_fds[1]); +} + +void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) +{ + int ignored = 0xdada; + int futex_ret; + int some_int = __LINE__; + + dprintf1("disabling write to PKEY[%02d], " + "doing futex gunk in buffer\n", pkey); + *ptr = some_int; + pkey_write_deny(pkey); + futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, + &ignored, ignored); + if (DEBUG_LEVEL > 0) + perror("futex"); + dprintf1("futex() ret: %d\n", futex_ret); +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) +{ + int err; + int i; + + /* Note: 0 is the default pkey, so don't mess with it */ + for (i = 1; i < NR_PKEYS; i++) { + if (pkey == i) + continue; + + dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_pkey_free(i); + pkey_assert(err); + + err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); + pkey_assert(err); + } +} + +/* Assumes that all pkeys other than 'pkey' are unallocated */ +void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) +{ + int 
err;
+	int bad_pkey = NR_PKEYS+99;
+
+	/* pass a known-invalid pkey in: */
+	err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey);
+	pkey_assert(err);
+}
+
+void become_child(void)
+{
+	pid_t forkret;
+
+	forkret = fork();
+	pkey_assert(forkret >= 0);
+	dprintf3("[%d] fork() ret: %d\n", getpid(), forkret);
+
+	if (!forkret) {
+		/* in the child */
+		return;
+	}
+	exit(0);
+}
+
+/* Assumes that all pkeys other than 'pkey' are unallocated */
+void test_pkey_alloc_exhaust(int *ptr, u16 pkey)
+{
+	int err;
+	int allocated_pkeys[NR_PKEYS] = {0};
+	int nr_allocated_pkeys = 0;
+	int i;
+
+	for (i = 0; i < NR_PKEYS*3; i++) {
+		int new_pkey;
+		dprintf1("%s() alloc loop: %d\n", __func__, i);
+		new_pkey = alloc_pkey();
+		dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx"
+				" shadow: 0x%016llx\n",
+				__func__, __LINE__, err, __read_pkey_reg(),
+				shadow_pkey_reg);
+		read_pkey_reg(); /* for shadow checking */
+		dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC);
+		if ((new_pkey == -1) && (errno == ENOSPC)) {
+			dprintf2("%s() failed to allocate pkey after %d tries\n",
+				__func__, nr_allocated_pkeys);
+		} else {
+			/*
+			 * Ensure the number of successes never
+			 * exceeds the number of keys supported
+			 * in the hardware.
+			 */
+			pkey_assert(nr_allocated_pkeys < NR_PKEYS);
+			allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+		}
+
+		/*
+		 * Make sure that allocation state is properly
+		 * preserved across fork().
+		 */
+		if (i == NR_PKEYS*2)
+			become_child();
+	}
+
+	dprintf3("%s()::%d\n", __func__, __LINE__);
+
+	/*
+	 * On x86:
+	 * There are 16 pkeys supported in hardware.  Three are
+	 * allocated by the time we get here:
+	 *   1. The default key (0)
+	 *   2. One possibly consumed by an execute-only mapping.
+	 *   3. One allocated by the test code and passed in via
+	 *      'pkey' to this function.
+	 * Ensure that we can allocate at least another 13 (16-3).
+	 *
+	 * On powerpc:
+	 * There are either 5, 28, 29 or 32 pkeys supported in
+	 * hardware depending on the page size (4K or 64K) and
+	 * platform (powernv or powervm). Four are allocated by
+	 * the time we get here. These include pkey-0, pkey-1,
+	 * exec-only pkey and the one allocated by the test code.
+	 * Ensure that we can allocate the remaining.
+	 */
+	pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1));
+
+	for (i = 0; i < nr_allocated_pkeys; i++) {
+		err = sys_pkey_free(allocated_pkeys[i]);
+		pkey_assert(!err);
+		read_pkey_reg(); /* for shadow checking */
+	}
+}
+
+void arch_force_pkey_reg_init(void)
+{
+#if defined(__i386__) || defined(__x86_64__) /* arch */
+	u64 *buf;
+
+	/*
+	 * All keys should be allocated and set to allow reads and
+	 * writes, so the register should be all 0.  If not, just
+	 * skip the test.
+	 */
+	if (read_pkey_reg())
+		return;
+
+	/*
+	 * Just allocate an absurd amount of memory rather than
+	 * doing the XSAVE size enumeration dance.
+	 */
+	buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+
+	/* These __builtins require compiling with -mxsave */
+
+	/* XSAVE to build a valid buffer: */
+	__builtin_ia32_xsave(buf, XSTATE_PKEY);
+	/* Clear XSTATE_BV[PKRU]: */
+	buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY;
+	/* XRSTOR will likely get PKRU back to the init state: */
+	__builtin_ia32_xrstor(buf, XSTATE_PKEY);
+
+	munmap(buf, 1*MB);
+#endif
+}
+
+
+/*
+ * This is mostly useless on ppc for now.  But it will not
+ * hurt anything and should give some better coverage as
+ * a long-running test that continually checks the pkey
+ * register.
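+ * (The pkey register is PKRU on x86 and the AMR on powerpc; the
+ * read_pkey_reg()/__read_pkey_reg() helpers hide the difference.)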
+ */
+void test_pkey_init_state(int *ptr, u16 pkey)
+{
+	int err;
+	int allocated_pkeys[NR_PKEYS] = {0};
+	int nr_allocated_pkeys = 0;
+	int i;
+
+	for (i = 0; i < NR_PKEYS; i++) {
+		int new_pkey = alloc_pkey();
+
+		if (new_pkey < 0)
+			continue;
+		allocated_pkeys[nr_allocated_pkeys++] = new_pkey;
+	}
+
+	dprintf3("%s()::%d\n", __func__, __LINE__);
+
+	arch_force_pkey_reg_init();
+
+	/*
+	 * Loop for a bit, hoping to exercise the kernel
+	 * context switch code.
+	 */
+	for (i = 0; i < 1000000; i++)
+		read_pkey_reg();
+
+	for (i = 0; i < nr_allocated_pkeys; i++) {
+		err = sys_pkey_free(allocated_pkeys[i]);
+		pkey_assert(!err);
+		read_pkey_reg(); /* for shadow checking */
+	}
+}
+
+/*
+ * pkey 0 is special.  It is allocated by default, so you do not
+ * have to call pkey_alloc() to use it first.  Make sure that it
+ * is usable.
+ */
+void test_mprotect_with_pkey_0(int *ptr, u16 pkey)
+{
+	long size;
+	int prot;
+
+	assert(pkey_last_malloc_record);
+	size = pkey_last_malloc_record->size;
+	/*
+	 * This is a bit of a hack.  But mprotect() requires
+	 * huge-page-aligned sizes when operating on hugetlbfs.
+	 * So, make sure that we use something that's a multiple
+	 * of a huge page when we can.
+	 */
+	if (size >= HPAGE_SIZE)
+		size = HPAGE_SIZE;
+	prot = pkey_last_malloc_record->prot;
+
+	/* Use pkey 0 */
+	mprotect_pkey(ptr, size, prot, 0);
+
+	/* Make sure that we can set it back to the original pkey. */
+	mprotect_pkey(ptr, size, prot, pkey);
+}
+
+void test_ptrace_of_child(int *ptr, u16 pkey)
+{
+	__attribute__((__unused__)) int peek_result;
+	pid_t child_pid;
+	void *ignored = 0;
+	long ret;
+	int status;
+	/*
+	 * This is the "control" for our little experiment.  Make sure
+	 * we can always access it when ptracing.
+	 */
+	int *plain_ptr_unaligned = malloc(HPAGE_SIZE);
+	int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE);
+
+	/*
+	 * Fork a child which is an exact copy of this process, of course.
+	 * That means we can do all of our tests via ptrace() and then plain
+	 * memory access and ensure they work differently.
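+	 * (ptrace() peeks go through the kernel's remote-access path, which
+	 * is not restricted by the tracer's own pkey register, while the
+	 * direct loads below are.)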
+ */ + child_pid = fork_lazy_child(); + dprintf1("[%d] child pid: %d\n", getpid(), child_pid); + + ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); + if (ret) + perror("attach"); + dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); + pkey_assert(ret != -1); + ret = waitpid(child_pid, &status, WUNTRACED); + if ((ret != child_pid) || !(WIFSTOPPED(status))) { + fprintf(stderr, "weird waitpid result %ld stat %x\n", + ret, status); + pkey_assert(0); + } + dprintf2("waitpid ret: %ld\n", ret); + dprintf2("waitpid status: %d\n", status); + + pkey_access_deny(pkey); + pkey_write_deny(pkey); + + /* Write access, untested for now: + ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); + pkey_assert(ret != -1); + dprintf1("poke at %p: %ld\n", peek_at, ret); + */ + + /* + * Try to access the pkey-protected "ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect an exception: */ + peek_result = read_ptr(ptr); + expected_pkey_fault(pkey); + + /* + * Try to access the NON-pkey-protected "plain_ptr" via ptrace: + */ + ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); + /* expect it to work, without an error: */ + pkey_assert(ret != -1); + /* Now access from the current task, and expect NO exception: */ + peek_result = read_ptr(plain_ptr); + do_not_expect_pkey_fault("read plain pointer after ptrace"); + + ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); + pkey_assert(ret != -1); + + ret = kill(child_pid, SIGKILL); + pkey_assert(ret != -1); + + wait(&status); + + free(plain_ptr_unaligned); +} + +void *get_pointer_to_instructions(void) +{ + void *p1; + + p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); + dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); + /* lots_o_noops_around_write should be page-aligned already */ + assert(p1 == &lots_o_noops_around_write); + + /* Point 'p1' at the *second* page of the function: */ + p1 += PAGE_SIZE; + + /* + * Try to ensure we fault this in on next touch to ensure + * we get an instruction fault as opposed to a data one + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + + return p1; +} + +void test_executing_on_unreadable_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + p1 = get_pointer_to_instructions(); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); + pkey_assert(!ret); + pkey_access_deny(pkey); + + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); + + /* + * Make sure this is an *instruction* fault + */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, pkey); +} + +void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) +{ + void *p1; + int scratch; + int ptr_contents; + int ret; + + dprintf1("%s() start\n", __func__); + + p1 = get_pointer_to_instructions(); + lots_o_noops_around_write(&scratch); + ptr_contents = read_ptr(p1); + dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); + + /* Use a *normal* mprotect(), not mprotect_pkey(): */ + ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); + pkey_assert(!ret); + + /* + * Reset the shadow, assuming that the above mprotect() + * correctly changed PKRU, but to an unknown value since + 
* the actual allocated pkey is unknown. + */ + shadow_pkey_reg = __read_pkey_reg(); + + dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); + + /* Make sure this is an *instruction* fault */ + madvise(p1, PAGE_SIZE, MADV_DONTNEED); + lots_o_noops_around_write(&scratch); + do_not_expect_pkey_fault("executing on PROT_EXEC memory"); + expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); + + /* + * Put the memory back to non-PROT_EXEC. Should clear the + * exec-only pkey off the VMA and allow it to be readable + * again. Go to PROT_NONE first to check for a kernel bug + * that did not clear the pkey when doing PROT_NONE. + */ + ret = mprotect(p1, PAGE_SIZE, PROT_NONE); + pkey_assert(!ret); + + ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); + pkey_assert(!ret); + ptr_contents = read_ptr(p1); + do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); +} + +#if defined(__i386__) || defined(__x86_64__) +void test_ptrace_modifies_pkru(int *ptr, u16 pkey) +{ + u32 new_pkru; + pid_t child; + int status, ret; + int pkey_offset = pkey_reg_xstate_offset(); + size_t xsave_size = cpu_max_xsave_size(); + void *xsave; + u32 *pkey_register; + u64 *xstate_bv; + struct iovec iov; + + new_pkru = ~read_pkey_reg(); + /* Don't make PROT_EXEC mappings inaccessible */ + new_pkru &= ~3; + + child = fork(); + pkey_assert(child >= 0); + dprintf3("[%d] fork() ret: %d\n", getpid(), child); + if (!child) { + ptrace(PTRACE_TRACEME, 0, 0, 0); + /* Stop and allow the tracer to modify PKRU directly */ + raise(SIGSTOP); + + /* + * need __read_pkey_reg() version so we do not do shadow_pkey_reg + * checking + */ + if (__read_pkey_reg() != new_pkru) + exit(1); + + /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ + raise(SIGSTOP); + + if (__read_pkey_reg() != 0) + exit(1); + + /* Stop and allow the tracer to examine PKRU */ + raise(SIGSTOP); + + exit(0); + } + + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + xsave = (void *)malloc(xsave_size); + pkey_assert(xsave > 0); + + /* Modify the PKRU register directly */ + iov.iov_base = xsave; + iov.iov_len = xsave_size; + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + pkey_register = (u32 *)(xsave + pkey_offset); + pkey_assert(*pkey_register == read_pkey_reg()); + + *pkey_register = new_pkru; + + ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == new_pkru); + + /* Execute the tracee */ + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value change */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == new_pkru); + + /* Clear the PKRU bit from XSTATE_BV */ + xstate_bv = (u64 *)(xsave + 512); + *xstate_bv &= ~(1 << 9); + + ret = ptrace(PTRACE_SETREGSET, child, (void 
*)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + + /* Test that the modification is visible in ptrace before any execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == 0); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + + /* Test that the tracee saw the PKRU value go to 0 */ + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); + + /* Test that the modification is visible in ptrace after execution */ + memset(xsave, 0xCC, xsave_size); + ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); + pkey_assert(ret == 0); + pkey_assert(*pkey_register == 0); + + ret = ptrace(PTRACE_CONT, child, 0, 0); + pkey_assert(ret == 0); + pkey_assert(child == waitpid(child, &status, 0)); + dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); + pkey_assert(WIFEXITED(status)); + pkey_assert(WEXITSTATUS(status) == 0); + free(xsave); +} +#endif + +void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) +{ + int size = PAGE_SIZE; + int sret; + + if (cpu_has_pkeys()) { + dprintf1("SKIP: %s: no CPU support\n", __func__); + return; + } + + sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); + pkey_assert(sret < 0); +} + +void (*pkey_tests[])(int *ptr, u16 pkey) = { + test_read_of_write_disabled_region, + test_read_of_access_disabled_region, + test_read_of_access_disabled_region_with_page_already_mapped, + test_write_of_write_disabled_region, + test_write_of_write_disabled_region_with_page_already_mapped, + test_write_of_access_disabled_region, + test_write_of_access_disabled_region_with_page_already_mapped, + test_kernel_write_of_access_disabled_region, + test_kernel_write_of_write_disabled_region, + test_kernel_gup_of_access_disabled_region, + test_kernel_gup_write_to_write_disabled_region, + test_executing_on_unreadable_memory, + test_implicit_mprotect_exec_only_memory, + test_mprotect_with_pkey_0, + test_ptrace_of_child, + test_pkey_init_state, + test_pkey_syscalls_on_non_allocated_pkey, + test_pkey_syscalls_bad_args, + test_pkey_alloc_exhaust, + test_pkey_alloc_free_attach_pkey0, +#if defined(__i386__) || defined(__x86_64__) + test_ptrace_modifies_pkru, +#endif +}; + +void run_tests_once(void) +{ + int *ptr; + int prot = PROT_READ|PROT_WRITE; + + for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { + int pkey; + int orig_pkey_faults = pkey_faults; + + dprintf1("======================\n"); + dprintf1("test %d preparing...\n", test_nr); + + tracing_on(); + pkey = alloc_random_pkey(); + dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); + ptr = malloc_pkey(PAGE_SIZE, prot, pkey); + dprintf1("test %d starting...\n", test_nr); + pkey_tests[test_nr](ptr, pkey); + dprintf1("freeing test memory: %p\n", ptr); + free_pkey_malloc(ptr); + sys_pkey_free(pkey); + + dprintf1("pkey_faults: %d\n", pkey_faults); + dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); + + tracing_off(); + close_test_fds(); + + printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); + dprintf1("======================\n\n"); + } + iteration_nr++; +} + +void pkey_setup_shadow(void) +{ + shadow_pkey_reg = __read_pkey_reg(); +} + +int main(void) +{ + int nr_iterations = 22; + int pkeys_supported = is_pkeys_supported(); + + srand((unsigned int)time(NULL)); + + setup_handlers(); + + printf("has pkeys: 
%d\n", pkeys_supported); + + if (!pkeys_supported) { + int size = PAGE_SIZE; + int *ptr; + + printf("running PKEY tests for unsupported CPU/OS\n"); + + ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + assert(ptr != (void *)-1); + test_mprotect_pkey_on_unsupported_cpu(ptr, 1); + exit(0); + } + + pkey_setup_shadow(); + printf("startup pkey_reg: %016llx\n", read_pkey_reg()); + setup_hugetlbfs(); + + while (nr_iterations-- > 0) + run_tests_once(); + + printf("done (all tests OK)\n"); + return 0; +} --- /dev/null +++ a/tools/testing/selftests/mm/run_vmtests.sh @@ -0,0 +1,274 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# Please run as root + +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +exitcode=0 + +usage() { + cat <<EOF +usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"] + -t: specify specific categories to tests to run + -h: display this message + +The default behavior is to run all tests. + +Alternatively, specific groups tests can be run by passing a string +to the -t argument containing one or more of the following categories +separated by spaces: +- mmap + tests for mmap(2) +- gup_test + tests for gup using gup_test interface +- userfaultfd + tests for userfaultfd(2) +- compaction + a test for the patch "Allow compaction of unevictable pages" +- mlock + tests for mlock(2) +- mremap + tests for mremap(2) +- hugevm + tests for very large virtual address space +- vmalloc + vmalloc smoke tests +- hmm + hmm smoke tests +- madv_populate + test memadvise(2) MADV_POPULATE_{READ,WRITE} options +- memfd_secret + test memfd_secret(2) +- process_mrelease + test process_mrelease(2) +- ksm + ksm tests that do not require >=2 NUMA nodes +- ksm_numa + ksm tests that require >=2 NUMA nodes +- pkey + memory protection key tests +- soft_dirty + test soft dirty page bit semantics +- cow + test copy-on-write semantics +example: ./run_vmtests.sh -t "hmm mmap ksm" +EOF + exit 0 +} + + +while getopts "ht:" OPT; do + case ${OPT} in + "h") usage ;; + "t") VM_SELFTEST_ITEMS=${OPTARG} ;; + esac +done +shift $((OPTIND -1)) + +# default behavior: run all tests +VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default} + +test_selected() { + if [ "$VM_SELFTEST_ITEMS" == "default" ]; then + # If no VM_SELFTEST_ITEMS are specified, run all tests + return 0 + fi + # If test selected argument is one of the test items + if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then + return 0 + else + return 1 + fi +} + +# get huge pagesize and freepages from /proc/meminfo +while read -r name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs="$size" + fi + if [ "$name" = "Hugepagesize:" ]; then + hpgsize_KB="$size" + fi +done < /proc/meminfo + +# Simple hugetlbfs tests have a hardcoded minimum requirement of +# huge pages totaling 256MB (262144KB) in size. The userfaultfd +# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take +# both of these requirements into account and attempt to increase +# number of huge pages available. +nr_cpus=$(nproc) +hpgsize_MB=$((hpgsize_KB / 1024)) +half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) +needmem_KB=$((half_ufd_size_MB * 2 * 1024)) + +# set proper nr_hugepages +if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then + nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) + needpgs=$((needmem_KB / hpgsize_KB)) + tries=2 + while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do + lackpgs=$((needpgs - freepgs)) + echo 3 > /proc/sys/vm/drop_caches + if ! 
echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then + echo "Please run this test as root" + exit $ksft_skip + fi + while read -r name size unit; do + if [ "$name" = "HugePages_Free:" ]; then + freepgs=$size + fi + done < /proc/meminfo + tries=$((tries - 1)) + done + if [ "$freepgs" -lt "$needpgs" ]; then + printf "Not enough huge pages available (%d < %d)\n" \ + "$freepgs" "$needpgs" + exit 1 + fi +else + echo "no hugetlbfs support in kernel?" + exit 1 +fi + +# filter 64bit architectures +ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" +if [ -z "$ARCH" ]; then + ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') +fi +VADDR64=0 +echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1 + +# Usage: run_test [test binary] [arbitrary test arguments...] +run_test() { + if test_selected ${CATEGORY}; then + local title="running $*" + local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) + printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" + + "$@" + local ret=$? + if [ $ret -eq 0 ]; then + echo "[PASS]" + elif [ $ret -eq $ksft_skip ]; then + echo "[SKIP]" + exitcode=$ksft_skip + else + echo "[FAIL]" + exitcode=1 + fi + fi # test_selected +} + +CATEGORY="hugetlb" run_test ./hugepage-mmap + +shmmax=$(cat /proc/sys/kernel/shmmax) +shmall=$(cat /proc/sys/kernel/shmall) +echo 268435456 > /proc/sys/kernel/shmmax +echo 4194304 > /proc/sys/kernel/shmall +CATEGORY="hugetlb" run_test ./hugepage-shm +echo "$shmmax" > /proc/sys/kernel/shmmax +echo "$shmall" > /proc/sys/kernel/shmall + +CATEGORY="hugetlb" run_test ./map_hugetlb +CATEGORY="hugetlb" run_test ./hugepage-mremap +CATEGORY="hugetlb" run_test ./hugepage-vmemmap +CATEGORY="hugetlb" run_test ./hugetlb-madvise + +if test_selected "hugetlb"; then + echo "NOTE: These hugetlb tests provide minimal coverage. Use" + echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" + echo " hugetlb regression testing." +fi + +CATEGORY="mmap" run_test ./map_fixed_noreplace + +# get_user_pages_fast() benchmark +CATEGORY="gup_test" run_test ./gup_test -u +# pin_user_pages_fast() benchmark +CATEGORY="gup_test" run_test ./gup_test -a +# Dump pages 0, 19, and 4096, using pin_user_pages: +CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 + +uffd_mods=("" ":dev") +for mod in "${uffd_mods[@]}"; do + CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16 + # Hugetlb tests require source and destination huge pages. Pass in half + # the size ($half_ufd_size_MB), which is used for *each*. 
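	# For example, with 8 CPUs and 2MB huge pages:
	# (8 * 2 + 127) / 128 * 128 = 128MB per side, 256MB in total.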
+ CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 + CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 + CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16 +done + +#cleanup +echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages + +CATEGORY="compaction" run_test ./compaction_test + +CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit + +CATEGORY="mmap" run_test ./map_populate + +CATEGORY="mlock" run_test ./mlock-random-test + +CATEGORY="mlock" run_test ./mlock2-tests + +CATEGORY="process_mrelease" run_test ./mrelease_test + +CATEGORY="mremap" run_test ./mremap_test + +CATEGORY="hugetlb" run_test ./thuge-gen + +if [ $VADDR64 -ne 0 ]; then + CATEGORY="hugevm" run_test ./virtual_address_range + + # virtual address 128TB switch test + CATEGORY="hugevm" run_test ./va_128TBswitch.sh +fi # VADDR64 + +# vmalloc stability smoke test +CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke + +CATEGORY="mremap" run_test ./mremap_dontunmap + +CATEGORY="hmm" run_test ./test_hmm.sh smoke + +# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests +CATEGORY="madv_populate" run_test ./madv_populate + +CATEGORY="memfd_secret" run_test ./memfd_secret + +# KSM MADV_MERGEABLE test with 10 identical pages +CATEGORY="ksm" run_test ./ksm_tests -M -p 10 +# KSM unmerge test +CATEGORY="ksm" run_test ./ksm_tests -U +# KSM test with 10 zero pages and use_zero_pages = 0 +CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0 +# KSM test with 10 zero pages and use_zero_pages = 1 +CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1 +# KSM test with 2 NUMA nodes and merge_across_nodes = 1 +CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1 +# KSM test with 2 NUMA nodes and merge_across_nodes = 0 +CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 + +CATEGORY="ksm" run_test ./ksm_functional_tests + +run_test ./ksm_functional_tests + +# protection_keys tests +if [ -x ./protection_keys_32 ] +then + CATEGORY="pkey" run_test ./protection_keys_32 +fi + +if [ -x ./protection_keys_64 ] +then + CATEGORY="pkey" run_test ./protection_keys_64 +fi + +CATEGORY="soft_dirty" run_test ./soft-dirty + +# COW tests +CATEGORY="cow" run_test ./cow + +exit $exitcode --- /dev/null +++ a/tools/testing/selftests/mm/settings @@ -0,0 +1 @@ +timeout=45 --- /dev/null +++ a/tools/testing/selftests/mm/soft-dirty.c @@ -0,0 +1,210 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdio.h> +#include <string.h> +#include <stdbool.h> +#include <fcntl.h> +#include <stdint.h> +#include <malloc.h> +#include <sys/mman.h> +#include "../kselftest.h" +#include "vm_util.h" + +#define PAGEMAP_FILE_PATH "/proc/self/pagemap" +#define TEST_ITERATIONS 10000 + +static void test_simple(int pagemap_fd, int pagesize) +{ + int i; + char *map; + + map = aligned_alloc(pagesize, pagesize); + if (!map) + ksft_exit_fail_msg("mmap failed\n"); + + clear_softdirty(); + + for (i = 0 ; i < TEST_ITERATIONS; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + + clear_softdirty(); + } + free(map); + + ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); +} + +static void test_vma_reuse(int pagemap_fd, int pagesize) +{ + char *map, *map2; + + map = mmap(NULL, pagesize, (PROT_READ | 
PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // The kernel always marks new regions as soft dirty + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s dirty bit of allocated page\n", __func__); + + clear_softdirty(); + munmap(map, pagesize); + + map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed"); + + // Dirty bit is set for new regions even if they are reused + if (map == map2) + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s dirty bit of reused address page\n", __func__); + else + ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__); + + munmap(map2, pagesize); +} + +static void test_hugepage(int pagemap_fd, int pagesize) +{ + char *map; + int i, ret; + size_t hpage_len = read_pmd_pagesize(); + + map = memalign(hpage_len, hpage_len); + if (!map) + ksft_exit_fail_msg("memalign failed\n"); + + ret = madvise(map, hpage_len, MADV_HUGEPAGE); + if (ret) + ksft_exit_fail_msg("madvise failed %d\n", ret); + + for (i = 0; i < hpage_len; i++) + map[i] = (char)i; + + if (check_huge_anon(map, 1, hpage_len)) { + ksft_test_result_pass("Test %s huge page allocation\n", __func__); + + clear_softdirty(); + for (i = 0 ; i < TEST_ITERATIONS ; i++) { + if (pagemap_is_softdirty(pagemap_fd, map) == 1) { + ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); + break; + } + + clear_softdirty(); + // Write something to the page to get the dirty bit enabled on the page + map[0]++; + + if (pagemap_is_softdirty(pagemap_fd, map) == 0) { + ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); + break; + } + clear_softdirty(); + } + + ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__); + } else { + // hugepage allocation failed. 
skip these tests + ksft_test_result_skip("Test %s huge page allocation\n", __func__); + ksft_test_result_skip("Test %s huge page dirty bit\n", __func__); + } + free(map); +} + +static void test_mprotect(int pagemap_fd, int pagesize, bool anon) +{ + const char *type[] = {"file", "anon"}; + const char *fname = "./soft-dirty-test-file"; + int test_fd; + char *map; + + if (anon) { + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (!map) + ksft_exit_fail_msg("anon mmap failed\n"); + } else { + test_fd = open(fname, O_RDWR | O_CREAT); + if (test_fd < 0) { + ksft_test_result_skip("Test %s open() file failed\n", __func__); + return; + } + unlink(fname); + ftruncate(test_fd, pagesize); + map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE, + MAP_SHARED, test_fd, 0); + if (!map) + ksft_exit_fail_msg("file mmap failed\n"); + } + + *map = 1; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s dirty bit of new written page\n", + __func__, type[anon]); + clear_softdirty(); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after clear_refs\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RO\n", + __func__, type[anon]); + mprotect(map, pagesize, PROT_READ|PROT_WRITE); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0, + "Test %s-%s soft-dirty clear after marking RW\n", + __func__, type[anon]); + *map = 2; + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-%s soft-dirty after rewritten\n", + __func__, type[anon]); + + munmap(map, pagesize); + + if (!anon) + close(test_fd); +} + +static void test_mprotect_anon(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, true); +} + +static void test_mprotect_file(int pagemap_fd, int pagesize) +{ + test_mprotect(pagemap_fd, pagesize, false); +} + +int main(int argc, char **argv) +{ + int pagemap_fd; + int pagesize; + + ksft_print_header(); + ksft_set_plan(15); + + pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); + if (pagemap_fd < 0) + ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); + + pagesize = getpagesize(); + + test_simple(pagemap_fd, pagesize); + test_vma_reuse(pagemap_fd, pagesize); + test_hugepage(pagemap_fd, pagesize); + test_mprotect_anon(pagemap_fd, pagesize); + test_mprotect_file(pagemap_fd, pagesize); + + close(pagemap_fd); + + return ksft_exit_pass(); +} --- /dev/null +++ a/tools/testing/selftests/mm/split_huge_page_test.c @@ -0,0 +1,309 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual + * address range in a process via <debugfs>/split_huge_pages interface. 
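+ * The debugfs file accepts either "<pid>,<vaddr_start>,<vaddr_end>" to
+ * split mapped THPs in a live process or "<path>,<pgoff_start>,<pgoff_end>"
+ * for a file-backed THP (see PID_FMT and PATH_FMT below).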
+ */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <unistd.h> +#include <inttypes.h> +#include <string.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/mount.h> +#include <malloc.h> +#include <stdbool.h> +#include "vm_util.h" + +uint64_t pagesize; +unsigned int pageshift; +uint64_t pmd_pagesize; + +#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" +#define INPUT_MAX 80 + +#define PID_FMT "%d,0x%lx,0x%lx" +#define PATH_FMT "%s,0x%lx,0x%lx" + +#define PFN_MASK ((1UL<<55)-1) +#define KPF_THP (1UL<<22) + +int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) +{ + uint64_t paddr; + uint64_t page_flags; + + if (pagemap_file) { + pread(pagemap_file, &paddr, sizeof(paddr), + ((long)vaddr >> pageshift) * sizeof(paddr)); + + if (kpageflags_file) { + pread(kpageflags_file, &page_flags, sizeof(page_flags), + (paddr & PFN_MASK) * sizeof(page_flags)); + + return !!(page_flags & KPF_THP); + } + } + return 0; +} + +static int write_file(const char *path, const char *buf, size_t buflen) +{ + int fd; + ssize_t numwritten; + + fd = open(path, O_WRONLY); + if (fd == -1) + return 0; + + numwritten = write(fd, buf, buflen - 1); + close(fd); + if (numwritten < 1) + return 0; + + return (unsigned int) numwritten; +} + +static void write_debugfs(const char *fmt, ...) +{ + char input[INPUT_MAX]; + int ret; + va_list argp; + + va_start(argp, fmt); + ret = vsnprintf(input, INPUT_MAX, fmt, argp); + va_end(argp); + + if (ret >= INPUT_MAX) { + printf("%s: Debugfs input is too long\n", __func__); + exit(EXIT_FAILURE); + } + + if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { + perror(SPLIT_DEBUGFS); + exit(EXIT_FAILURE); + } +} + +void split_pmd_thp(void) +{ + char *one_page; + size_t len = 4 * pmd_pagesize; + size_t i; + + one_page = memalign(pmd_pagesize, len); + + if (!one_page) { + printf("Fail to allocate memory\n"); + exit(EXIT_FAILURE); + } + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* split all THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, + (uint64_t)one_page + len); + + for (i = 0; i < len; i++) + if (one_page[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + + + if (check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); + exit(EXIT_FAILURE); + } + + printf("Split huge pages successful\n"); + free(one_page); +} + +void split_pte_mapped_thp(void) +{ + char *one_page, *pte_mapped, *pte_mapped2; + size_t len = 4 * pmd_pagesize; + uint64_t thp_size; + size_t i; + const char *pagemap_template = "/proc/%d/pagemap"; + const char *kpageflags_proc = "/proc/kpageflags"; + char pagemap_proc[255]; + int pagemap_fd; + int kpageflags_fd; + + if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { + perror("get pagemap proc error"); + exit(EXIT_FAILURE); + } + pagemap_fd = open(pagemap_proc, O_RDONLY); + + if (pagemap_fd == -1) { + perror("read pagemap:"); + exit(EXIT_FAILURE); + } + + kpageflags_fd = open(kpageflags_proc, O_RDONLY); + + if (kpageflags_fd == -1) { + perror("read kpageflags:"); + exit(EXIT_FAILURE); + } + + one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + madvise(one_page, len, MADV_HUGEPAGE); + + for (i = 0; i < len; i++) + one_page[i] = (char)i; + + if (!check_huge_anon(one_page, 1, 
pmd_pagesize)) { + printf("No THP is allocated\n"); + exit(EXIT_FAILURE); + } + + /* remap the first pagesize of first THP */ + pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); + + /* remap the Nth pagesize of Nth THP */ + for (i = 1; i < 4; i++) { + pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, + pagesize, pagesize, + MREMAP_MAYMOVE|MREMAP_FIXED, + pte_mapped + pagesize * i); + if (pte_mapped2 == (char *)-1) { + perror("mremap failed"); + exit(EXIT_FAILURE); + } + } + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + + if (thp_size != 4) { + printf("Some THPs are missing during mremap\n"); + exit(EXIT_FAILURE); + } + + /* split all remapped THPs */ + write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, + (uint64_t)pte_mapped + pagesize * 4); + + /* smap does not show THPs after mremap, use kpageflags instead */ + thp_size = 0; + for (i = 0; i < pagesize * 4; i++) { + if (pte_mapped[i] != (char)i) { + printf("%ld byte corrupted\n", i); + exit(EXIT_FAILURE); + } + if (i % pagesize == 0 && + is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) + thp_size++; + } + + if (thp_size) { + printf("Still %ld THPs not split\n", thp_size); + exit(EXIT_FAILURE); + } + + printf("Split PTE-mapped huge pages successful\n"); + munmap(one_page, len); + close(pagemap_fd); + close(kpageflags_fd); +} + +void split_file_backed_thp(void) +{ + int status; + int fd; + ssize_t num_written; + char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; + const char *tmpfs_loc = mkdtemp(tmpfs_template); + char testfile[INPUT_MAX]; + uint64_t pgoff_start = 0, pgoff_end = 1024; + + printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); + + status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); + + if (status) { + printf("Unable to create a tmpfs for testing\n"); + exit(EXIT_FAILURE); + } + + status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); + if (status >= INPUT_MAX) { + printf("Fail to create file-backed THP split testing file\n"); + goto cleanup; + } + + fd = open(testfile, O_CREAT|O_WRONLY); + if (fd == -1) { + perror("Cannot open testing file\n"); + goto cleanup; + } + + /* write something to the file, so a file-backed THP can be allocated */ + num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); + close(fd); + + if (num_written < 1) { + printf("Fail to write data to testing file\n"); + goto cleanup; + } + + /* split the file-backed THP */ + write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); + + status = unlink(testfile); + if (status) + perror("Cannot remove testing file\n"); + +cleanup: + status = umount(tmpfs_loc); + if (status) { + printf("Unable to umount %s\n", tmpfs_loc); + exit(EXIT_FAILURE); + } + status = rmdir(tmpfs_loc); + if (status) { + perror("cannot remove tmp dir"); + exit(EXIT_FAILURE); + } + + printf("file-backed THP split test done, please check dmesg for more information\n"); +} + +int main(int argc, char **argv) +{ + if (geteuid() != 0) { + printf("Please run the benchmark as root\n"); + exit(EXIT_FAILURE); + } + + pagesize = getpagesize(); + pageshift = ffs(pagesize) - 1; + pmd_pagesize = read_pmd_pagesize(); + + split_pmd_thp(); + split_pte_mapped_thp(); + split_file_backed_thp(); + + return 0; +} --- /dev/null +++ a/tools/testing/selftests/mm/test_hmm.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# 
SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@xxxxxxxxx>
+#
+# This is a test script for the test_hmm kernel test driver. It is just
+# a kernel module loader: it loads test_hmm (optionally with the
+# spm_addr_dev0/spm_addr_dev1 module parameters) and then runs the
+# hmm-tests binary as a smoke test.
+
+TEST_NAME="test_hmm"
+DRIVER="test_hmm"
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+check_test_requirements()
+{
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		echo "$0: Must be run as root"
+		exit $ksft_skip
+	fi
+
+	if ! which modprobe > /dev/null 2>&1; then
+		echo "$0: You need modprobe installed"
+		exit $ksft_skip
+	fi
+
+	if ! modinfo $DRIVER > /dev/null 2>&1; then
+		echo "$0: You must have the following enabled in your kernel:"
+		echo "CONFIG_TEST_HMM=m"
+		exit $ksft_skip
+	fi
+}
+
+load_driver()
+{
+	if [ $# -eq 0 ]; then
+		modprobe $DRIVER > /dev/null 2>&1
+	else
+		if [ $# -eq 2 ]; then
+			modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 > /dev/null 2>&1
+		else
+			echo "Missing module parameters. Make sure to pass"\
+				"spm_addr_dev0 and spm_addr_dev1"
+			usage
+		fi
+	fi
+}
+
+unload_driver()
+{
+	modprobe -r $DRIVER > /dev/null 2>&1
+}
+
+run_smoke()
+{
+	echo "Running smoke test. Note, this test provides basic coverage."
+
+	load_driver $1 $2
+	$(dirname "${BASH_SOURCE[0]}")/hmm-tests
+	unload_driver
+}
+
+usage()
+{
+	echo -n "Usage: $0"
+	echo
+	echo "Example usage:"
+	echo
+	echo "# Shows help message"
+	echo "./${TEST_NAME}.sh"
+	echo
+	echo "# Smoke testing"
+	echo "./${TEST_NAME}.sh smoke"
+	echo
+	echo "# Smoke testing with SPM enabled"
+	echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
+	echo
+	exit 0
+}
+
+function run_test()
+{
+	if [ $# -eq 0 ]; then
+		usage
+	else
+		if [ "$1" = "smoke" ]; then
+			run_smoke $2 $3
+		else
+			usage
+		fi
+	fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
--- /dev/null
+++ a/tools/testing/selftests/mm/test_vmalloc.sh
@@ -0,0 +1,177 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@xxxxxxxxx>
+#
+# This is a test script for the kernel test driver to analyse vmalloc
+# allocator. Therefore it is just a kernel module loader. You can specify
+# and pass different parameters in order to:
+#     a) analyse the performance of vmalloc allocations;
+#     b) stress and stability-check the vmalloc subsystem.
+
+TEST_NAME="vmalloc"
+DRIVER="test_${TEST_NAME}"
+NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
+
+# 1 if fails
+exitcode=1
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+#
+# Static templates for performance, stressing and smoke tests.
+# Also it is possible to pass any supported parameters manually.
+#
+PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
+SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
+STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
+
+check_test_requirements()
+{
+	uid=$(id -u)
+	if [ $uid -ne 0 ]; then
+		echo "$0: Must be run as root"
+		exit $ksft_skip
+	fi
+
+	if ! which modprobe > /dev/null 2>&1; then
+		echo "$0: You need modprobe installed"
+		exit $ksft_skip
+	fi
+
+	if ! modinfo $DRIVER > /dev/null 2>&1; then
+		echo "$0: You must have the following enabled in your kernel:"
+		echo "CONFIG_TEST_VMALLOC=m"
+		exit $ksft_skip
+	fi
+}
+
+run_performance_check()
+{
+	echo "Run performance tests to evaluate how fast vmalloc allocation is."
+	echo "It runs all test cases on one single CPU with sequential order."
+
+	modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+run_stability_check()
+{
+	echo "Run stability tests. In order to stress the vmalloc subsystem all"
+	echo "available test cases are run by NUM_CPUS workers simultaneously."
+	echo "It will take time, so be patient."
+
+	modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+run_smoke_check()
+{
+	echo "Run smoke test. Note, this test provides basic coverage."
+	echo "Please check the $0 usage output to see how it can be used"
+	echo "for deep performance analysis as well as stress testing."
+
+	modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+usage()
+{
+	echo -n "Usage: $0 [ performance ] | [ stress ] | [ smoke ] | "
+	echo "manual parameters"
+	echo
+	echo "Valid tests and parameters:"
+	echo
+	modinfo $DRIVER
+	echo
+	echo "Example usage:"
+	echo
+	echo "# Shows help message"
+	echo "./${DRIVER}.sh"
+	echo
+	echo "# Runs test 1 (id_1), repeating it 5 times with NUM_CPUS workers"
+	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5"
+	echo
+	echo -n "# Runs 4 tests (id_1|id_2|id_4|id_16) on one CPU in "
+	echo "sequential order"
+	echo -n "./${DRIVER}.sh sequential_test_order=1 "
+	echo "run_test_mask=23"
+	echo
+	echo -n "# Runs all tests with NUM_CPUS workers, in shuffled order, repeated "
+	echo "20 times"
+	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20"
+	echo
+	echo "# Performance analysis"
+	echo "./${DRIVER}.sh performance"
+	echo
+	echo "# Stress testing"
+	echo "./${DRIVER}.sh stress"
+	echo
+	exit 0
+}
+
+function validate_passed_args()
+{
+	VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
+
+	#
+	# Something has been passed, check it.
+	#
+	for passed_arg in $@; do
+		key=${passed_arg//=*/}
+		val="${passed_arg:$((${#key}+1))}"
+		valid=0
+
+		for valid_arg in $VALID_ARGS; do
+			if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
+				valid=1
+				break
+			fi
+		done
+
+		if [[ $valid -ne 1 ]]; then
+			echo "Error: key or value is not correct: ${key} $val"
+			exit $exitcode
+		fi
+	done
+}
+
+function run_manual_check()
+{
+	#
+	# Validate the passed parameters. If a wrong one is found,
+	# the script exits and does not execute further.
+	#
+	validate_passed_args $@
+
+	echo "Run the test with the following parameters: $@"
+	modprobe $DRIVER $@ > /dev/null 2>&1
+	echo "Done."
+	echo "Check the kernel ring buffer to see the summary."
+}
+
+function run_test()
+{
+	if [ $# -eq 0 ]; then
+		usage
+	else
+		if [[ "$1" = "performance" ]]; then
+			run_performance_check
+		elif [[ "$1" = "stress" ]]; then
+			run_stability_check
+		elif [[ "$1" = "smoke" ]]; then
+			run_smoke_check
+		else
+			run_manual_check $@
+		fi
+	fi
+}
+
+check_test_requirements
+run_test $@
+
+exit 0
--- /dev/null
+++ a/tools/testing/selftests/mm/thuge-gen.c
@@ -0,0 +1,257 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Test selecting other page sizes for mmap/shmget.
+
+   Before running this, huge pages must have been reserved for each
+   huge page size to be tested.
+   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must
+   be used. shmmax must also be increased, and you need to run as root
+   to work around some weird permissions in shm.
+   Nothing else using huge pages should run in parallel.
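+   For example, 2MB pages can be reserved at run time with (the count
+   of 8 is illustrative; NUM_PAGES free pages per tested size is the
+   minimum that sanity_checks() below demands):
+       echo 8 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages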
+ When the program aborts you may need to clean up the shm segments with + ipcrm -m by hand, like this + sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m + (warning this will remove all if someone else uses them) */ + +#define _GNU_SOURCE 1 +#include <sys/mman.h> +#include <stdlib.h> +#include <stdio.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/stat.h> +#include <glob.h> +#include <assert.h> +#include <unistd.h> +#include <stdarg.h> +#include <string.h> + +#define err(x) perror(x), exit(1) + +#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) +#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) +#define MAP_HUGE_SHIFT 26 +#define MAP_HUGE_MASK 0x3f +#if !defined(MAP_HUGETLB) +#define MAP_HUGETLB 0x40000 +#endif + +#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ +#define SHM_HUGE_SHIFT 26 +#define SHM_HUGE_MASK 0x3f +#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) +#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) + +#define NUM_PAGESIZES 5 + +#define NUM_PAGES 4 + +#define Dprintf(fmt...) // printf(fmt) + +unsigned long page_sizes[NUM_PAGESIZES]; +int num_page_sizes; + +int ilog2(unsigned long v) +{ + int l = 0; + while ((1UL << l) < v) + l++; + return l; +} + +void find_pagesizes(void) +{ + glob_t g; + int i; + glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); + assert(g.gl_pathc <= NUM_PAGESIZES); + for (i = 0; i < g.gl_pathc; i++) { + sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", + &page_sizes[i]); + page_sizes[i] <<= 10; + printf("Found %luMB\n", page_sizes[i] >> 20); + } + num_page_sizes = g.gl_pathc; + globfree(&g); +} + +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + free(line); + return hps; +} + +void show(unsigned long ps) +{ + char buf[100]; + if (ps == getpagesize()) + return; + printf("%luMB: ", ps >> 20); + fflush(stdout); + snprintf(buf, sizeof buf, + "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", + ps >> 10); + system(buf); +} + +unsigned long read_sysfs(int warn, char *fmt, ...) 
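+/* Read one unsigned long from the file whose path is built from the
+ * printf-style arguments; returns 0, optionally printing a warning,
+ * when the file cannot be opened. */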
+{
+	char *line = NULL;
+	size_t linelen = 0;
+	char buf[100];
+	FILE *f;
+	va_list ap;
+	unsigned long val = 0;
+
+	va_start(ap, fmt);
+	vsnprintf(buf, sizeof buf, fmt, ap);
+	va_end(ap);
+
+	f = fopen(buf, "r");
+	if (!f) {
+		if (warn)
+			printf("missing %s\n", buf);
+		return 0;
+	}
+	if (getline(&line, &linelen, f) > 0) {
+		sscanf(line, "%lu", &val);
+	}
+	fclose(f);
+	free(line);
+	return val;
+}
+
+unsigned long read_free(unsigned long ps)
+{
+	return read_sysfs(ps != getpagesize(),
+			  "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
+			  ps >> 10);
+}
+
+void test_mmap(unsigned long size, unsigned flags)
+{
+	char *map;
+	unsigned long before, after;
+	int err;
+
+	before = read_free(size);
+	map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
+			MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
+
+	if (map == (char *)-1) err("mmap");
+	memset(map, 0xff, size*NUM_PAGES);
+	after = read_free(size);
+	Dprintf("before %lu after %lu diff %ld size %lu\n",
+		before, after, before - after, size);
+	assert(size == getpagesize() || (before - after) == NUM_PAGES);
+	show(size);
+	/* unmap the whole NUM_PAGES mapping, not just the first page */
+	err = munmap(map, size * NUM_PAGES);
+	assert(!err);
+}
+
+void test_shmget(unsigned long size, unsigned flags)
+{
+	int id;
+	unsigned long before, after;
+	int err;
+
+	before = read_free(size);
+	id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
+	if (id < 0) err("shmget");
+
+	struct shm_info i;
+	if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
+	Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
+
+	Dprintf("id %d\n", id);
+	char *map = shmat(id, NULL, 0600);
+	if (map == (char*)-1) err("shmat");
+
+	shmctl(id, IPC_RMID, NULL);
+
+	memset(map, 0xff, size*NUM_PAGES);
+	after = read_free(size);
+
+	Dprintf("before %lu after %lu diff %ld size %lu\n",
+		before, after, before - after, size);
+	assert(size == getpagesize() || (before - after) == NUM_PAGES);
+	show(size);
+	err = shmdt(map);
+	assert(!err);
+}
+
+void sanity_checks(void)
+{
+	int i;
+	unsigned long largest = getpagesize();
+
+	for (i = 0; i < num_page_sizes; i++) {
+		if (page_sizes[i] > largest)
+			largest = page_sizes[i];
+
+		if (read_free(page_sizes[i]) < NUM_PAGES) {
+			printf("Not enough huge pages for page size %lu MB, need %u\n",
+				page_sizes[i] >> 20,
+				NUM_PAGES);
+			exit(0);
+		}
+	}
+
+	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
+		printf("Please do echo %lu > /proc/sys/kernel/shmmax\n", largest * NUM_PAGES);
+		exit(0);
+	}
+
+#if defined(__x86_64__)
+	if (largest != 1U<<30) {
+		printf("No GB pages available on x86-64\n"
+		       "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
+		exit(0);
+	}
+#endif
+}
+
+int main(void)
+{
+	int i;
+	unsigned long default_hps = default_huge_page_size();
+
+	find_pagesizes();
+
+	sanity_checks();
+
+	for (i = 0; i < num_page_sizes; i++) {
+		unsigned long ps = page_sizes[i];
+		int arg = ilog2(ps) << MAP_HUGE_SHIFT;
+		printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
+		test_mmap(ps, MAP_HUGETLB | arg);
+	}
+	printf("Testing default huge mmap\n");
+	/* no size bits in the flags: use the default hugetlb page size
+	 * (SHM_HUGETLB is a shmget() flag and has no meaning for mmap) */
+	test_mmap(default_hps, 0);
+
+	puts("Testing non-huge shmget");
+	test_shmget(getpagesize(), 0);
+
+	for (i = 0; i < num_page_sizes; i++) {
+		unsigned long ps = page_sizes[i];
+		int arg = ilog2(ps) << SHM_HUGE_SHIFT;
+		printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
+		test_shmget(ps, SHM_HUGETLB | arg);
+	}
+	puts("default huge shmget");
+	test_shmget(default_hps, SHM_HUGETLB);
+
+	return 0;
+}
--- /dev/null
+++ a/tools/testing/selftests/mm/transhuge-stress.c
@@ -0,0 +1,122 @@
+/*
+ *
Stress test for transparent huge pages, memory compaction and migration. + * + * Authors: Konstantin Khlebnikov <koct9i@xxxxxxxxx> + * + * This is free and unencumbered software released into the public domain. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include <err.h> +#include <time.h> +#include <unistd.h> +#include <fcntl.h> +#include <string.h> +#include <sys/mman.h> +#include "util.h" + +int backing_fd = -1; +int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; +#define PROT_RW (PROT_READ | PROT_WRITE) + +int main(int argc, char **argv) +{ + size_t ram, len; + void *ptr, *p; + struct timespec a, b; + int i = 0; + char *name = NULL; + double s; + uint8_t *map; + size_t map_len; + int pagemap_fd; + + ram = sysconf(_SC_PHYS_PAGES); + if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) + ram = SIZE_MAX / 4; + else + ram *= sysconf(_SC_PAGESIZE); + len = ram; + + while (++i < argc) { + if (!strcmp(argv[i], "-h")) + errx(1, "usage: %s [size in MiB]", argv[0]); + else if (!strcmp(argv[i], "-f")) + name = argv[++i]; + else + len = atoll(argv[i]) << 20; + } + + if (name) { + backing_fd = open(name, O_RDWR); + if (backing_fd == -1) + errx(2, "open %s", name); + mmap_flags = MAP_SHARED; + } + + warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" + " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, + ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); + + pagemap_fd = open("/proc/self/pagemap", O_RDONLY); + if (pagemap_fd < 0) + err(2, "open pagemap"); + + len -= len % HPAGE_SIZE; + ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); + if (ptr == MAP_FAILED) + err(2, "initial mmap"); + ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; + + if (madvise(ptr, len, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + map_len = ram >> (HPAGE_SHIFT - 1); + map = malloc(map_len); + if (!map) + errx(2, "map malloc"); + + while (1) { + int nr_succeed = 0, nr_failed = 0, nr_pages = 0; + + memset(map, 0, map_len); + + clock_gettime(CLOCK_MONOTONIC, &a); + for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { + int64_t pfn; + + pfn = allocate_transhuge(p, pagemap_fd); + + if (pfn < 0) { + nr_failed++; + } else { + size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT); + + nr_succeed++; + if (idx >= map_len) { + map = realloc(map, idx + 1); + if (!map) + errx(2, "map realloc"); + memset(map + map_len, 0, idx + 1 - map_len); + map_len = idx + 1; + } + if (!map[idx]) + nr_pages++; + map[idx] = 1; + } + + /* split transhuge page, keep last page */ + if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED)) + err(2, "MADV_DONTNEED"); + } + clock_gettime(CLOCK_MONOTONIC, &b); + s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; + + warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" + "%4d succeed, %4d failed, %4d different pages", + s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), + nr_succeed, nr_failed, nr_pages); + } +} --- /dev/null +++ a/tools/testing/selftests/mm/userfaultfd.c @@ -0,0 +1,1858 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Stress userfaultfd syscall. + * + * Copyright (C) 2015 Red Hat, Inc. + * + * This test allocates two virtual areas and bounces the physical + * memory across the two virtual areas (from area_src to area_dst) + * using userfaultfd. 
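+ * (A typical invocation, matching the examples string further down, is
+ * "./userfaultfd anon 100 99999": bounce a 100MiB anonymous area back
+ * and forth 99999 times.)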
+ * + * There are three threads running per CPU: + * + * 1) one per-CPU thread takes a per-page pthread_mutex in a random + * page of the area_dst (while the physical page may still be in + * area_src), and increments a per-page counter in the same page, + * and checks its value against a verification region. + * + * 2) another per-CPU thread handles the userfaults generated by + * thread 1 above. userfaultfd blocking reads or poll() modes are + * exercised interleaved. + * + * 3) one last per-CPU thread transfers the memory in the background + * at maximum bandwidth (if not already transferred by thread + * 2). Each cpu thread takes cares of transferring a portion of the + * area. + * + * When all threads of type 3 completed the transfer, one bounce is + * complete. area_src and area_dst are then swapped. All threads are + * respawned and so the bounce is immediately restarted in the + * opposite direction. + * + * per-CPU threads 1 by triggering userfaults inside + * pthread_mutex_lock will also verify the atomicity of the memory + * transfer (UFFDIO_COPY). + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <errno.h> +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <time.h> +#include <signal.h> +#include <poll.h> +#include <string.h> +#include <linux/mman.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/wait.h> +#include <pthread.h> +#include <linux/userfaultfd.h> +#include <setjmp.h> +#include <stdbool.h> +#include <assert.h> +#include <inttypes.h> +#include <stdint.h> +#include <sys/random.h> + +#include "../kselftest.h" +#include "vm_util.h" + +#ifdef __NR_userfaultfd + +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; + +#define BOUNCE_RANDOM (1<<0) +#define BOUNCE_RACINGFAULTS (1<<1) +#define BOUNCE_VERIFY (1<<2) +#define BOUNCE_POLL (1<<3) +static int bounces; + +#define TEST_ANON 1 +#define TEST_HUGETLB 2 +#define TEST_SHMEM 3 +static int test_type; + +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + +/* test using /dev/userfaultfd, instead of userfaultfd(2) */ +static bool test_dev_userfaultfd; + +/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ +#define ALARM_INTERVAL_SECS 10 +static volatile bool test_uffdio_copy_eexist = true; +static volatile bool test_uffdio_zeropage_eexist = true; +/* Whether to test uffd write-protection */ +static bool test_uffdio_wp = true; +/* Whether to test uffd minor faults */ +static bool test_uffdio_minor = false; +static bool map_shared; +static int mem_fd; +static unsigned long long *count_verify; +static int uffd = -1; +static int uffd_flags, finished, *pipefd; +static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; +static char *zeropage; +pthread_attr_t attr; +static bool test_collapse; + +/* Userfaultfd test statistics */ +struct uffd_stats { + int cpu; + unsigned long missing_faults; + unsigned long wp_faults; + unsigned long minor_faults; +}; + +/* pthread_mutex_t starts at page offset 0 */ +#define area_mutex(___area, ___nr) \ + ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) +/* + * count is placed in the page after pthread_mutex_t naturally aligned + * to avoid non alignment faults on non-x86 archs. 
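+ * As a worked example (sizes are the common glibc/x86-64 ones; other
+ * ABIs differ): with a 4096-byte page and a 40-byte pthread_mutex_t,
+ * the counter lands at page offset 40, which is already 8-byte
+ * aligned, so the round-up only changes the placement where
+ * sizeof(pthread_mutex_t) is not a multiple of 8.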
+ */ +#define area_count(___area, ___nr) \ + ((volatile unsigned long long *) ((unsigned long) \ + ((___area) + (___nr)*page_size + \ + sizeof(pthread_mutex_t) + \ + sizeof(unsigned long long) - 1) & \ + ~(unsigned long)(sizeof(unsigned long long) \ + - 1))) + +#define swap(a, b) \ + do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) + +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + +const char *examples = + "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" + "./userfaultfd anon 100 99999\n\n" + "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" + "./userfaultfd anon:dev 100 99999\n\n" + "# Run share memory test on 1GiB region with 99 bounces:\n" + "./userfaultfd shmem 1000 99\n\n" + "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" + "./userfaultfd hugetlb 256 50\n\n" + "# Run the same hugetlb test but using shared file:\n" + "./userfaultfd hugetlb_shared 256 50\n\n" + "# 10MiB-~6GiB 999 bounces anonymous test, " + "continue forever unless an error triggers\n" + "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; + +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " + "[hugetlbfs_file]\n\n"); + fprintf(stderr, "Supported <test type>: anon, hugetlb, " + "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " + "Supported mods:\n"); + fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); + fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" + "memory\n"); + fprintf(stderr, "\nExample test mod usage:\n"); + fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); + fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); + + fprintf(stderr, "Examples:\n\n"); + fprintf(stderr, "%s", examples); + exit(1); +} + +#define _err(fmt, ...) \ + do { \ + int ret = errno; \ + fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ + fprintf(stderr, " (errno=%d, line=%d)\n", \ + ret, __LINE__); \ + } while (0) + +#define errexit(exitcode, fmt, ...) \ + do { \ + _err(fmt, ##__VA_ARGS__); \ + exit(exitcode); \ + } while (0) + +#define err(fmt, ...) 
errexit(1, fmt, ##__VA_ARGS__) + +static void uffd_stats_reset(struct uffd_stats *uffd_stats, + unsigned long n_cpus) +{ + int i; + + for (i = 0; i < n_cpus; i++) { + uffd_stats[i].cpu = i; + uffd_stats[i].missing_faults = 0; + uffd_stats[i].wp_faults = 0; + uffd_stats[i].minor_faults = 0; + } +} + +static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) +{ + int i; + unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; + + for (i = 0; i < n_cpus; i++) { + miss_total += stats[i].missing_faults; + wp_total += stats[i].wp_faults; + minor_total += stats[i].minor_faults; + } + + printf("userfaults: "); + if (miss_total) { + printf("%llu missing (", miss_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].missing_faults); + printf("\b) "); + } + if (wp_total) { + printf("%llu wp (", wp_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].wp_faults); + printf("\b) "); + } + if (minor_total) { + printf("%llu minor (", minor_total); + for (i = 0; i < n_cpus; i++) + printf("%lu+", stats[i].minor_faults); + printf("\b)"); + } + printf("\n"); +} + +static void anon_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); +} + +static void anon_allocate_area(void **alloc_area, bool is_src) +{ + *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); +} + +static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ +} + +static void hugetlb_release_pages(char *rel_area) +{ + if (!map_shared) { + if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) + err("madvise(MADV_DONTNEED) failed"); + } else { + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); + } +} + +static void hugetlb_allocate_area(void **alloc_area, bool is_src) +{ + off_t size = nr_pages * page_size; + off_t offset = is_src ? 0 : size; + void *area_alias = NULL; + char **alloc_area_alias; + + *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + (is_src ? 0 : MAP_NORESERVE), + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of hugetlbfs file failed"); + + if (map_shared) { + area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, + MAP_SHARED, mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of hugetlb file alias failed"); + } + + if (is_src) { + alloc_area_alias = &area_src_alias; + } else { + alloc_area_alias = &area_dst_alias; + } + if (area_alias) + *alloc_area_alias = area_alias; +} + +static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + if (!map_shared) + return; + + *start = (unsigned long) area_dst_alias + offset; +} + +static void shmem_release_pages(char *rel_area) +{ + if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) + err("madvise(MADV_REMOVE) failed"); +} + +static void shmem_allocate_area(void **alloc_area, bool is_src) +{ + void *area_alias = NULL; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 
0 : bytes; + char *p = NULL, *p_alias = NULL; + + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } + + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (*alloc_area == MAP_FAILED) + err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); + + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + mem_fd, offset); + if (area_alias == MAP_FAILED) + err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); + + if (is_src) + area_src_alias = area_alias; + else + area_dst_alias = area_alias; +} + +static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) +{ + *start = (unsigned long)area_dst_alias + offset; +} + +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + +struct uffd_test_ops { + void (*allocate_area)(void **alloc_area, bool is_src); + void (*release_pages)(char *rel_area); + void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); +}; + +static struct uffd_test_ops anon_uffd_test_ops = { + .allocate_area = anon_allocate_area, + .release_pages = anon_release_pages, + .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, +}; + +static struct uffd_test_ops shmem_uffd_test_ops = { + .allocate_area = shmem_allocate_area, + .release_pages = shmem_release_pages, + .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, +}; + +static struct uffd_test_ops hugetlb_uffd_test_ops = { + .allocate_area = hugetlb_allocate_area, + .release_pages = hugetlb_release_pages, + .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, +}; + +static struct uffd_test_ops *uffd_test_ops; + +static inline uint64_t uffd_minor_feature(void) +{ + if (test_type == TEST_HUGETLB && map_shared) + return UFFD_FEATURE_MINOR_HUGETLBFS; + else if (test_type == TEST_SHMEM) + return UFFD_FEATURE_MINOR_SHMEM; + else + return 0; +} + +static uint64_t get_expected_ioctls(uint64_t mode) +{ + uint64_t ioctls = UFFD_API_RANGE_IOCTLS; + + if (test_type == TEST_HUGETLB) + ioctls &= ~(1 << _UFFDIO_ZEROPAGE); + + if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) + ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); + + if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) + ioctls &= ~(1 << _UFFDIO_CONTINUE); + + return ioctls; +} + +static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) +{ + uint64_t expected = get_expected_ioctls(mode); + uint64_t actual = ioctls & expected; + + if (actual != expected) { + err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, + expected, actual); + } +} + +static int __userfaultfd_open_dev(void) +{ + int fd, _uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); + + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); + if (_uffd < 0) + errexit(errno == ENOTTY ? 
KSFT_SKIP : 1, + "creating userfaultfd failed"); + close(fd); + return _uffd; +} + +static void userfaultfd_open(uint64_t *features) +{ + struct uffdio_api uffdio_api; + + if (test_dev_userfaultfd) + uffd = __userfaultfd_open_dev(); + else { + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); + if (uffd < 0) + errexit(errno == ENOSYS ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + } + uffd_flags = fcntl(uffd, F_GETFD, NULL); + + uffdio_api.api = UFFD_API; + uffdio_api.features = *features; + if (ioctl(uffd, UFFDIO_API, &uffdio_api)) + err("UFFDIO_API failed.\nPlease make sure to " + "run with either root or ptrace capability."); + if (uffdio_api.api != UFFD_API) + err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api); + + *features = uffdio_api.features; +} + +static inline void munmap_area(void **area) +{ + if (*area) + if (munmap(*area, nr_pages * page_size)) + err("munmap"); + + *area = NULL; +} + +static void uffd_test_ctx_clear(void) +{ + size_t i; + + if (pipefd) { + for (i = 0; i < nr_cpus * 2; ++i) { + if (close(pipefd[i])) + err("close pipefd"); + } + free(pipefd); + pipefd = NULL; + } + + if (count_verify) { + free(count_verify); + count_verify = NULL; + } + + if (uffd != -1) { + if (close(uffd)) + err("close uffd"); + uffd = -1; + } + + munmap_area((void **)&area_src); + munmap_area((void **)&area_src_alias); + munmap_area((void **)&area_dst); + munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); +} + +static void uffd_test_ctx_init(uint64_t features) +{ + unsigned long nr, cpu; + + uffd_test_ctx_clear(); + + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); + + userfaultfd_open(&features); + + count_verify = malloc(nr_pages * sizeof(unsigned long long)); + if (!count_verify) + err("count_verify"); + + for (nr = 0; nr < nr_pages; nr++) { + *area_mutex(area_src, nr) = + (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER; + count_verify[nr] = *area_count(area_src, nr) = 1; + /* + * In the transition between 255 to 256, powerpc will + * read out of order in my_bcmp and see both bytes as + * zero, so leave a placeholder below always non-zero + * after the count, to avoid my_bcmp to trigger false + * positives. + */ + *(area_count(area_src, nr) + 1) = 1; + } + + /* + * After initialization of area_src, we must explicitly release pages + * for area_dst to make sure it's fully empty. Otherwise we could have + * some area_dst pages be errornously initialized with zero pages, + * hence we could hit memory corruption later in the test. + * + * One example is when THP is globally enabled, above allocate_area() + * calls could have the two areas merged into a single VMA (as they + * will have the same VMA flags so they're mergeable). When we + * initialize the area_src above, it's possible that some part of + * area_dst could have been faulted in via one huge THP that will be + * shared between area_src and area_dst. It could cause some of the + * area_dst won't be trapped by missing userfaults. + * + * This release_pages() will guarantee even if that happened, we'll + * proactively split the thp and drop any accidentally initialized + * pages within area_dst. 
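+ * Concretely: a single THP straddling the area_src/area_dst boundary
+ * would pre-populate the first pages of area_dst, and those pages
+ * would then never raise MISSING faults.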
+ */ + uffd_test_ops->release_pages(area_dst); + + pipefd = malloc(sizeof(int) * nr_cpus * 2); + if (!pipefd) + err("pipefd"); + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) + err("pipe"); +} + +static int my_bcmp(char *str1, char *str2, size_t n) +{ + unsigned long i; + for (i = 0; i < n; i++) + if (str1[i] != str2[i]) + return 1; + return 0; +} + +static void wp_range(int ufd, __u64 start, __u64 len, bool wp) +{ + struct uffdio_writeprotect prms; + + /* Write protection page faults */ + prms.range.start = start; + prms.range.len = len; + /* Undo write-protect, do wakeup after that */ + prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; + + if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) + err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); +} + +static void continue_range(int ufd, __u64 start, __u64 len) +{ + struct uffdio_continue req; + int ret; + + req.range.start = start; + req.range.len = len; + req.mode = 0; + + if (ioctl(ufd, UFFDIO_CONTINUE, &req)) + err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, + (uint64_t)start); + + /* + * Error handling within the kernel for continue is subtly different + * from copy or zeropage, so it may be a source of bugs. Trigger an + * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. + */ + req.mapped = 0; + ret = ioctl(ufd, UFFDIO_CONTINUE, &req); + if (ret >= 0 || req.mapped != -EEXIST) + err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, + ret, (int64_t) req.mapped); +} + +static void *locking_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr; + unsigned long long count; + + if (!(bounces & BOUNCE_RANDOM)) { + page_nr = -bounces; + if (!(bounces & BOUNCE_RACINGFAULTS)) + page_nr += cpu * nr_pages_per_cpu; + } + + while (!finished) { + if (bounces & BOUNCE_RANDOM) { + if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) + err("getrandom failed"); + } else + page_nr += 1; + page_nr %= nr_pages; + pthread_mutex_lock(area_mutex(area_dst, page_nr)); + count = *area_count(area_dst, page_nr); + if (count != count_verify[page_nr]) + err("page_nr %lu memory corruption %llu %llu", + page_nr, count, count_verify[page_nr]); + count++; + *area_count(area_dst, page_nr) = count_verify[page_nr] = count; + pthread_mutex_unlock(area_mutex(area_dst, page_nr)); + } + + return NULL; +} + +static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_copy->dst, + uffdio_copy->len, + offset); + if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy->copy != -EEXIST) + err("UFFDIO_COPY retry error: %"PRId64, + (int64_t)uffdio_copy->copy); + } else { + err("UFFDIO_COPY retry unexpected: %"PRId64, + (int64_t)uffdio_copy->copy); + } +} + +static void wake_range(int ufd, unsigned long addr, unsigned long len) +{ + struct uffdio_range uffdio_wake; + + uffdio_wake.start = addr; + uffdio_wake.len = len; + + if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) + fprintf(stderr, "error waking %lu\n", + addr), exit(1); +} + +static int __copy_page(int ufd, unsigned long offset, bool retry) +{ + struct uffdio_copy uffdio_copy; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu\n", offset); + uffdio_copy.dst = (unsigned long) area_dst + offset; + uffdio_copy.src = (unsigned long) area_src + offset; + uffdio_copy.len = page_size; + if (test_uffdio_wp) + uffdio_copy.mode = UFFDIO_COPY_MODE_WP; + else + 
uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { + /* real retval in ufdio_copy.copy */ + if (uffdio_copy.copy != -EEXIST) + err("UFFDIO_COPY error: %"PRId64, + (int64_t)uffdio_copy.copy); + wake_range(ufd, uffdio_copy.dst, page_size); + } else if (uffdio_copy.copy != page_size) { + err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); + } else { + if (test_uffdio_copy_eexist && retry) { + test_uffdio_copy_eexist = false; + retry_copy_page(ufd, &uffdio_copy, offset); + } + return 1; + } + return 0; +} + +static int copy_page_retry(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, true); +} + +static int copy_page(int ufd, unsigned long offset) +{ + return __copy_page(ufd, offset, false); +} + +static int uffd_read_msg(int ufd, struct uffd_msg *msg) +{ + int ret = read(uffd, msg, sizeof(*msg)); + + if (ret != sizeof(*msg)) { + if (ret < 0) { + if (errno == EAGAIN || errno == EINTR) + return 1; + err("blocking read error"); + } else { + err("short read"); + } + } + + return 0; +} + +static void uffd_handle_page_fault(struct uffd_msg *msg, + struct uffd_stats *stats) +{ + unsigned long offset; + + if (msg->event != UFFD_EVENT_PAGEFAULT) + err("unexpected msg event %u", msg->event); + + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { + /* Write protect page faults */ + wp_range(uffd, msg->arg.pagefault.address, page_size, false); + stats->wp_faults++; + } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { + uint8_t *area; + int b; + + /* + * Minor page faults + * + * To prove we can modify the original range for testing + * purposes, we're going to bit flip this range before + * continuing. + * + * Note that this requires all minor page fault tests operate on + * area_dst (non-UFFD-registered) and area_dst_alias + * (UFFD-registered). + */ + + area = (uint8_t *)(area_dst + + ((char *)msg->arg.pagefault.address - + area_dst_alias)); + for (b = 0; b < page_size; ++b) + area[b] = ~area[b]; + continue_range(uffd, msg->arg.pagefault.address, page_size); + stats->minor_faults++; + } else { + /* + * Missing page faults. + * + * Here we force a write check for each of the missing mode + * faults. It's guaranteed because the only threads that + * will trigger uffd faults are the locking threads, and + * their first instruction to touch the missing page will + * always be pthread_mutex_lock(). + * + * Note that here we relied on an NPTL glibc impl detail to + * always read the lock type at the entry of the lock op + * (pthread_mutex_t.__data.__type, offset 0x10) before + * doing any locking operations to guarantee that. It's + * actually not good to rely on this impl detail because + * logically a pthread-compatible lib can implement the + * locks without types and we can fail when linking with + * them. However since we used to find bugs with this + * strict check we still keep it around. Hopefully this + * could be a good hint when it fails again. If one day + * it'll break on some other impl of glibc we'll revisit. 
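+ * (The actual check is the "unexpected write fault" test right below:
+ * if the first touch of a missing page were a write, the fault message
+ * would carry UFFD_PAGEFAULT_FLAG_WRITE and we would error out.)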
+ */ + if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) + err("unexpected write fault"); + + offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; + offset &= ~(page_size-1); + + if (copy_page(uffd, offset)) + stats->missing_faults++; + } +} + +static void *uffd_poll_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + unsigned long cpu = stats->cpu; + struct pollfd pollfd[2]; + struct uffd_msg msg; + struct uffdio_register uffd_reg; + int ret; + char tmp_chr; + + pollfd[0].fd = uffd; + pollfd[0].events = POLLIN; + pollfd[1].fd = pipefd[cpu*2]; + pollfd[1].events = POLLIN; + + for (;;) { + ret = poll(pollfd, 2, -1); + if (ret <= 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + err("poll error: %d", ret); + } + if (pollfd[1].revents & POLLIN) { + if (read(pollfd[1].fd, &tmp_chr, 1) != 1) + err("read pipefd error"); + break; + } + if (!(pollfd[0].revents & POLLIN)) + err("pollfd[0].revents %d", pollfd[0].revents); + if (uffd_read_msg(uffd, &msg)) + continue; + switch (msg.event) { + default: + err("unexpected msg event %u\n", msg.event); + break; + case UFFD_EVENT_PAGEFAULT: + uffd_handle_page_fault(&msg, stats); + break; + case UFFD_EVENT_FORK: + close(uffd); + uffd = msg.arg.fork.ufd; + pollfd[0].fd = uffd; + break; + case UFFD_EVENT_REMOVE: + uffd_reg.range.start = msg.arg.remove.start; + uffd_reg.range.len = msg.arg.remove.end - + msg.arg.remove.start; + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) + err("remove failure"); + break; + case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ + area_dst = (char *)(unsigned long)msg.arg.remap.to; + break; + } + } + + return NULL; +} + +pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; + +static void *uffd_read_thread(void *arg) +{ + struct uffd_stats *stats = (struct uffd_stats *)arg; + struct uffd_msg msg; + + pthread_mutex_unlock(&uffd_read_mutex); + /* from here cancellation is ok */ + + for (;;) { + if (uffd_read_msg(uffd, &msg)) + continue; + uffd_handle_page_fault(&msg, stats); + } + + return NULL; +} + +static void *background_thread(void *arg) +{ + unsigned long cpu = (unsigned long) arg; + unsigned long page_nr, start_nr, mid_nr, end_nr; + + start_nr = cpu * nr_pages_per_cpu; + end_nr = (cpu+1) * nr_pages_per_cpu; + mid_nr = (start_nr + end_nr) / 2; + + /* Copy the first half of the pages */ + for (page_nr = start_nr; page_nr < mid_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + /* + * If we need to test uffd-wp, set it up now. 
Then we'll have + * at least the first half of the pages mapped already which + * can be write-protected for testing + */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst + start_nr * page_size, + nr_pages_per_cpu * page_size, true); + + /* + * Continue the 2nd half of the page copying, handling write + * protection faults if any + */ + for (page_nr = mid_nr; page_nr < end_nr; page_nr++) + copy_page_retry(uffd, page_nr * page_size); + + return NULL; +} + +static int stress(struct uffd_stats *uffd_stats) +{ + unsigned long cpu; + pthread_t locking_threads[nr_cpus]; + pthread_t uffd_threads[nr_cpus]; + pthread_t background_threads[nr_cpus]; + + finished = 0; + for (cpu = 0; cpu < nr_cpus; cpu++) { + if (pthread_create(&locking_threads[cpu], &attr, + locking_thread, (void *)cpu)) + return 1; + if (bounces & BOUNCE_POLL) { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_poll_thread, + (void *)&uffd_stats[cpu])) + return 1; + } else { + if (pthread_create(&uffd_threads[cpu], &attr, + uffd_read_thread, + (void *)&uffd_stats[cpu])) + return 1; + pthread_mutex_lock(&uffd_read_mutex); + } + if (pthread_create(&background_threads[cpu], &attr, + background_thread, (void *)cpu)) + return 1; + } + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(background_threads[cpu], NULL)) + return 1; + + /* + * Be strict and immediately zap area_src, the whole area has + * been transferred already by the background treads. The + * area_src could then be faulted in a racy way by still + * running uffdio_threads reading zeropages after we zapped + * area_src (but they're guaranteed to get -EEXIST from + * UFFDIO_COPY without writing zero pages into area_dst + * because the background threads already completed). + */ + uffd_test_ops->release_pages(area_src); + + finished = 1; + for (cpu = 0; cpu < nr_cpus; cpu++) + if (pthread_join(locking_threads[cpu], NULL)) + return 1; + + for (cpu = 0; cpu < nr_cpus; cpu++) { + char c; + if (bounces & BOUNCE_POLL) { + if (write(pipefd[cpu*2+1], &c, 1) != 1) + err("pipefd write error"); + if (pthread_join(uffd_threads[cpu], + (void *)&uffd_stats[cpu])) + return 1; + } else { + if (pthread_cancel(uffd_threads[cpu])) + return 1; + if (pthread_join(uffd_threads[cpu], NULL)) + return 1; + } + } + + return 0; +} + +sigjmp_buf jbuf, *sigbuf; + +static void sighndl(int sig, siginfo_t *siginfo, void *ptr) +{ + if (sig == SIGBUS) { + if (sigbuf) + siglongjmp(*sigbuf, 1); + abort(); + } +} + +/* + * For non-cooperative userfaultfd test we fork() a process that will + * generate pagefaults, will mremap the area monitored by the + * userfaultfd and at last this process will release the monitored + * area. + * For the anonymous and shared memory the area is divided into two + * parts, the first part is accessed before mremap, and the second + * part is accessed after mremap. Since hugetlbfs does not support + * mremap, the entire monitored area is accessed in a single pass for + * HUGETLB_TEST. + * The release of the pages currently generates event for shmem and + * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked + * for hugetlb. + * For signal test(UFFD_FEATURE_SIGBUS), signal_test = 1, we register + * monitored area, generate pagefaults and test that signal is delivered. + * Use UFFDIO_COPY to allocate missing page and retry. For signal_test = 2 + * test robustness use case - we release monitored area, fork a process + * that will generate pagefaults and verify signal is generated. 
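+ * In the code below, signal_test == 1 is faulting_process(1) run in
+ * the parent, and signal_test == 2 is faulting_process(2) run in a
+ * fork()ed child; see userfaultfd_sig_test().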
+ * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal + * feature. Using monitor thread, verify no userfault events are generated. + */ +static int faulting_process(int signal_test) +{ + unsigned long nr; + unsigned long long count; + unsigned long split_nr_pages; + unsigned long lastnr; + struct sigaction act; + volatile unsigned long signalled = 0; + + split_nr_pages = (nr_pages + 1) / 2; + + if (signal_test) { + sigbuf = &jbuf; + memset(&act, 0, sizeof(act)); + act.sa_sigaction = sighndl; + act.sa_flags = SA_SIGINFO; + if (sigaction(SIGBUS, &act, 0)) + err("sigaction"); + lastnr = (unsigned long)-1; + } + + for (nr = 0; nr < split_nr_pages; nr++) { + volatile int steps = 1; + unsigned long offset = nr * page_size; + + if (signal_test) { + if (sigsetjmp(*sigbuf, 1) != 0) { + if (steps == 1 && nr == lastnr) + err("Signal repeated"); + + lastnr = nr; + if (signal_test == 1) { + if (steps == 1) { + /* This is a MISSING request */ + steps++; + if (copy_page(uffd, offset)) + signalled++; + } else { + /* This is a WP request */ + assert(steps == 2); + wp_range(uffd, + (__u64)area_dst + + offset, + page_size, false); + } + } else { + signalled++; + continue; + } + } + } + + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + /* + * Trigger write protection if there is by writing + * the same value back. + */ + *area_count(area_dst, nr) = count; + } + + if (signal_test) + return signalled != split_nr_pages; + + area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, area_src); + if (area_dst == MAP_FAILED) + err("mremap"); + /* Reset area_src since we just clobbered it */ + area_src = NULL; + + for (; nr < nr_pages; nr++) { + count = *area_count(area_dst, nr); + if (count != count_verify[nr]) { + err("nr %lu memory corruption %llu %llu\n", + nr, count, count_verify[nr]); + } + /* + * Trigger write protection if there is by writing + * the same value back. 
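+ * Writing back the value just read leaves the contents unchanged, so
+ * this is safe whether or not the page is currently write-protected.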
+ */ + *area_count(area_dst, nr) = count; + } + + uffd_test_ops->release_pages(area_dst); + + for (nr = 0; nr < nr_pages; nr++) + if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) + err("nr %lu is not zero", nr); + + return 0; +} + +static void retry_uffdio_zeropage(int ufd, + struct uffdio_zeropage *uffdio_zeropage, + unsigned long offset) +{ + uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, + uffdio_zeropage->range.len, + offset); + if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { + if (uffdio_zeropage->zeropage != -EEXIST) + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } else { + err("UFFDIO_ZEROPAGE error: %"PRId64, + (int64_t)uffdio_zeropage->zeropage); + } +} + +static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) +{ + struct uffdio_zeropage uffdio_zeropage; + int ret; + bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); + __s64 res; + + if (offset >= nr_pages * page_size) + err("unexpected offset %lu", offset); + uffdio_zeropage.range.start = (unsigned long) area_dst + offset; + uffdio_zeropage.range.len = page_size; + uffdio_zeropage.mode = 0; + ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); + res = uffdio_zeropage.zeropage; + if (ret) { + /* real retval in ufdio_zeropage.zeropage */ + if (has_zeropage) + err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); + else if (res != -EINVAL) + err("UFFDIO_ZEROPAGE not -EINVAL"); + } else if (has_zeropage) { + if (res != page_size) { + err("UFFDIO_ZEROPAGE unexpected size"); + } else { + if (test_uffdio_zeropage_eexist && retry) { + test_uffdio_zeropage_eexist = false; + retry_uffdio_zeropage(ufd, &uffdio_zeropage, + offset); + } + return 1; + } + } else + err("UFFDIO_ZEROPAGE succeeded"); + + return 0; +} + +static int uffdio_zeropage(int ufd, unsigned long offset) +{ + return __uffdio_zeropage(ufd, offset, false); +} + +/* exercise UFFDIO_ZEROPAGE */ +static int userfaultfd_zeropage_test(void) +{ + struct uffdio_register uffdio_register; + + printf("testing UFFDIO_ZEROPAGE: "); + fflush(stdout); + + uffd_test_ctx_init(0); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (uffdio_zeropage(uffd, 0)) + if (my_bcmp(area_dst, zeropage, page_size)) + err("zeropage is not zero"); + + printf("done.\n"); + return 0; +} + +static int userfaultfd_events_test(void) +{ + struct uffdio_register uffdio_register; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + struct uffd_stats stats = { 0 }; + + printf("testing events (fork, remap, remove): "); + fflush(stdout); + + features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | + UFFD_FEATURE_EVENT_REMOVE; + uffd_test_ctx_init(features); + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (pthread_create(&uffd_mon, 
&attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(0)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + return 1; + + uffd_stats_report(&stats, 1); + + return stats.missing_faults != nr_pages; +} + +static int userfaultfd_sig_test(void) +{ + struct uffdio_register uffdio_register; + unsigned long userfaults; + pthread_t uffd_mon; + int err, features; + pid_t pid; + char c; + struct uffd_stats stats = { 0 }; + + printf("testing signal delivery: "); + fflush(stdout); + + features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; + uffd_test_ctx_init(features); + + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (faulting_process(1)) + err("faulting process failed"); + + uffd_test_ops->release_pages(area_dst); + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + pid = fork(); + if (pid < 0) + err("fork"); + + if (!pid) + exit(faulting_process(2)); + + waitpid(pid, &err, 0); + if (err) + err("faulting process failed"); + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, (void **)&userfaults)) + return 1; + + printf("done.\n"); + if (userfaults) + err("Signal test failed, userfaults: %ld", userfaults); + + return userfaults != 0; +} + +void check_memory_contents(char *p) +{ + unsigned long i; + uint8_t expected_byte; + void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + +static int userfaultfd_minor_test(void) +{ + unsigned long p; + struct uffdio_register uffdio_register; + pthread_t uffd_mon; + char c; + struct uffd_stats stats = { 0 }; + + if (!test_uffdio_minor) + return 0; + + printf("testing minor faults: "); + fflush(stdout); + + uffd_test_ctx_init(uffd_minor_feature()); + + uffdio_register.range.start = (unsigned long)area_dst_alias; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + /* + * After registering with UFFD, populate the non-UFFD-registered side of + * the shared mapping. This should *not* trigger any UFFD minor faults. + */ + for (p = 0; p < nr_pages; ++p) { + memset(area_dst + (p * page_size), p % ((uint8_t)-1), + page_size); + } + + if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) + err("uffd_poll_thread create"); + + /* + * Read each of the pages back using the UFFD-registered mapping. 
We + * expect that the first time we touch a page, it will result in a minor + * fault. uffd_poll_thread will resolve the fault by bit-flipping the + * page's contents, and then issuing a CONTINUE ioctl. + */ + check_memory_contents(area_dst_alias); + + if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) + err("pipe write"); + if (pthread_join(uffd_mon, NULL)) + return 1; + + uffd_stats_report(&stats, 1); + + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; +} + +#define BIT_ULL(nr) (1ULL << (nr)) +#define PM_SOFT_DIRTY BIT_ULL(55) +#define PM_MMAP_EXCLUSIVE BIT_ULL(56) +#define PM_UFFD_WP BIT_ULL(57) +#define PM_FILE BIT_ULL(61) +#define PM_SWAP BIT_ULL(62) +#define PM_PRESENT BIT_ULL(63) + +static int pagemap_open(void) +{ + int fd = open("/proc/self/pagemap", O_RDONLY); + + if (fd < 0) + err("open pagemap"); + + return fd; +} + +static uint64_t pagemap_read_vaddr(int fd, void *vaddr) +{ + uint64_t value; + int ret; + + ret = pread(fd, &value, sizeof(uint64_t), + ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); + if (ret != sizeof(uint64_t)) + err("pread() on pagemap failed"); + + return value; +} + +/* This macro let __LINE__ works in err() */ +#define pagemap_check_wp(value, wp) do { \ + if (!!(value & PM_UFFD_WP) != wp) \ + err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ + } while (0) + +static int pagemap_test_fork(bool present) +{ + pid_t child = fork(); + uint64_t value; + int fd, result; + + if (!child) { + /* Open the pagemap fd of the child itself */ + fd = pagemap_open(); + value = pagemap_read_vaddr(fd, area_dst); + /* + * After fork() uffd-wp bit should be gone as long as we're + * without UFFD_FEATURE_EVENT_FORK + */ + pagemap_check_wp(value, false); + /* Succeed */ + exit(0); + } + waitpid(child, &result, 0); + return result; +} + +static void userfaultfd_pagemap_test(unsigned int test_pgsize) +{ + struct uffdio_register uffdio_register; + int pagemap_fd; + uint64_t value; + + /* Pagemap tests uffd-wp only */ + if (!test_uffdio_wp) + return; + + /* Not enough memory to test this page size */ + if (test_pgsize > nr_pages * page_size) + return; + + printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); + /* Flush so it doesn't flush twice in parent/child later */ + fflush(stdout); + + uffd_test_ctx_init(0); + + if (test_pgsize > page_size) { + /* This is a thp test */ + if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) + err("madvise(MADV_HUGEPAGE) failed"); + } else if (test_pgsize == page_size) { + /* This is normal page test; force no thp */ + if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) + err("madvise(MADV_NOHUGEPAGE) failed"); + } + + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failed"); + + pagemap_fd = pagemap_open(); + + /* Touch the page */ + *area_dst = 1; + wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, 
true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(true)) + err("Detected stall uffd-wp bit in child"); + + /* Exclusive required or PAGEOUT won't work */ + if (!(value & PM_MMAP_EXCLUSIVE)) + err("multiple mapping detected: 0x%"PRIx64, value); + + if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) + err("madvise(MADV_PAGEOUT) failed"); + + /* Uffd-wp should persist even swapped out */ + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, true); + /* Make sure uffd-wp bit dropped when fork */ + if (pagemap_test_fork(false)) + err("Detected stall uffd-wp bit in child"); + + /* Unprotect; this tests swap pte modifications */ + wp_range(uffd, (uint64_t)area_dst, page_size, false); + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + /* Fault in the page from disk */ + *area_dst = 2; + value = pagemap_read_vaddr(pagemap_fd, area_dst); + pagemap_check_wp(value, false); + + close(pagemap_fd); + printf("done\n"); +} + +static int userfaultfd_stress(void) +{ + void *area; + unsigned long nr; + struct uffdio_register uffdio_register; + struct uffd_stats uffd_stats[nr_cpus]; + + uffd_test_ctx_init(0); + + if (posix_memalign(&area, page_size, page_size)) + err("out of memory"); + zeropage = area; + bzero(zeropage, page_size); + + pthread_mutex_lock(&uffd_read_mutex); + + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 16*1024*1024); + + while (bounces--) { + printf("bounces: %d, mode:", bounces); + if (bounces & BOUNCE_RANDOM) + printf(" rnd"); + if (bounces & BOUNCE_RACINGFAULTS) + printf(" racing"); + if (bounces & BOUNCE_VERIFY) + printf(" ver"); + if (bounces & BOUNCE_POLL) + printf(" poll"); + else + printf(" read"); + printf(", "); + fflush(stdout); + + if (bounces & BOUNCE_POLL) + fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); + else + fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); + + /* register */ + uffdio_register.range.start = (unsigned long) area_dst; + uffdio_register.range.len = nr_pages * page_size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (test_uffdio_wp) + uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure"); + assert_expected_ioctls_present( + uffdio_register.mode, uffdio_register.ioctls); + + if (area_dst_alias) { + uffdio_register.range.start = (unsigned long) + area_dst_alias; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) + err("register failure alias"); + } + + /* + * The madvise done previously isn't enough: some + * uffd_thread could have read userfaults (one of + * those already resolved by the background thread) + * and it may be in the process of calling + * UFFDIO_COPY. UFFDIO_COPY will read the zapped + * area_src and it would map a zero page in it (of + * course such a UFFDIO_COPY is perfectly safe as it'd + * return -EEXIST). The problem comes at the next + * bounce though: that racing UFFDIO_COPY would + * generate zeropages in the area_src, so invalidating + * the previous MADV_DONTNEED. Without this additional + * MADV_DONTNEED those zeropages leftovers in the + * area_src would lead to -EEXIST failure during the + * next bounce, effectively leaving a zeropage in the + * area_dst. + * + * Try to comment this out madvise to see the memory + * corruption being caught pretty quick. + * + * khugepaged is also inhibited to collapse THP after + * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's + * required to MADV_DONTNEED here. 
+ */ + uffd_test_ops->release_pages(area_dst); + + uffd_stats_reset(uffd_stats, nr_cpus); + + /* bounce pass */ + if (stress(uffd_stats)) + return 1; + + /* Clear all the write protections if there is any */ + if (test_uffdio_wp) + wp_range(uffd, (unsigned long)area_dst, + nr_pages * page_size, false); + + /* unregister */ + if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) + err("unregister failure"); + if (area_dst_alias) { + uffdio_register.range.start = (unsigned long) area_dst; + if (ioctl(uffd, UFFDIO_UNREGISTER, + &uffdio_register.range)) + err("unregister failure alias"); + } + + /* verification */ + if (bounces & BOUNCE_VERIFY) + for (nr = 0; nr < nr_pages; nr++) + if (*area_count(area_dst, nr) != count_verify[nr]) + err("error area_count %llu %llu %lu\n", + *area_count(area_src, nr), + count_verify[nr], nr); + + /* prepare next bounce */ + swap(area_src, area_dst); + + swap(area_src_alias, area_dst_alias); + + uffd_stats_report(uffd_stats, nr_cpus); + } + + if (test_type == TEST_ANON) { + /* + * shmem/hugetlb won't be able to run since they have different + * behavior on fork() (file-backed memory normally drops ptes + * directly when fork), meanwhile the pagemap test will verify + * pgtable entry of fork()ed child. + */ + userfaultfd_pagemap_test(page_size); + /* + * Hard-code for x86_64 for now for 2M THP, as x86_64 is + * currently the only one that supports uffd-wp + */ + userfaultfd_pagemap_test(page_size * 512); + } + + return userfaultfd_zeropage_test() || userfaultfd_sig_test() + || userfaultfd_events_test() || userfaultfd_minor_test(); +} + +/* + * Copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +static void set_test_type(const char *type) +{ + if (!strcmp(type, "anon")) { + test_type = TEST_ANON; + uffd_test_ops = &anon_uffd_test_ops; + } else if (!strcmp(type, "hugetlb")) { + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + } else if (!strcmp(type, "hugetlb_shared")) { + map_shared = true; + test_type = TEST_HUGETLB; + uffd_test_ops = &hugetlb_uffd_test_ops; + /* Minor faults require shared hugetlb; only enable here. 
*/ + test_uffdio_minor = true; + } else if (!strcmp(type, "shmem")) { + map_shared = true; + test_type = TEST_SHMEM; + uffd_test_ops = &shmem_uffd_test_ops; + test_uffdio_minor = true; + } +} + +static void parse_test_type_arg(const char *raw_type) +{ + char *buf = strdup(raw_type); + uint64_t features = UFFD_API_FEATURES; + + while (buf) { + const char *token = strsep(&buf, ":"); + + if (!test_type) + set_test_type(token); + else if (!strcmp(token, "dev")) + test_dev_userfaultfd = true; + else if (!strcmp(token, "syscall")) + test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; + else + err("unrecognized test mod '%s'", token); + } + + if (!test_type) + err("failed to parse test type argument: '%s'", raw_type); + + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); + + if (test_type == TEST_HUGETLB) + page_size = hpage_size; + else + page_size = sysconf(_SC_PAGE_SIZE); + + if (!page_size) + err("Unable to determine page size"); + if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 + > page_size) + err("Impossible to run this test"); + + /* + * Whether we can test certain features depends not just on test type, + * but also on whether or not this particular kernel supports the + * feature. + */ + + userfaultfd_open(&features); + + test_uffdio_wp = test_uffdio_wp && + (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); + test_uffdio_minor = test_uffdio_minor && + (features & uffd_minor_feature()); + + close(uffd); + uffd = -1; +} + +static void sigalrm(int sig) +{ + if (sig != SIGALRM) + abort(); + test_uffdio_copy_eexist = true; + test_uffdio_zeropage_eexist = true; + alarm(ALARM_INTERVAL_SECS); +} + +int main(int argc, char **argv) +{ + size_t bytes; + + if (argc < 4) + usage(); + + if (signal(SIGALRM, sigalrm) == SIG_ERR) + err("failed to arm SIGALRM"); + alarm(ALARM_INTERVAL_SECS); + + hpage_size = default_huge_page_size(); + parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be a multiple of %lu if :collapse mod set", + hpage_size >> 20); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * page_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (2 << (H-P))) + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to choose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. Unfortunately, prime factorization of + * N and nr_cpus may be arbitrary, so we would have to + * search for it. Instead, just use the highest power of 2 + * dividing both nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ?
x : y; + } + nr_pages_per_cpu = bytes / page_size / nr_cpus; + if (!nr_pages_per_cpu) { + _err("invalid MiB"); + usage(); + } + + bounces = atoi(argv[3]); + if (bounces <= 0) { + _err("invalid bounces"); + usage(); + } + nr_pages = nr_pages_per_cpu * nr_cpus; + + if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { + unsigned int memfd_flags = 0; + + if (test_type == TEST_HUGETLB) + memfd_flags = MFD_HUGETLB; + mem_fd = memfd_create(argv[0], memfd_flags); + if (mem_fd < 0) + err("memfd_create"); + if (ftruncate(mem_fd, nr_pages * page_size * 2)) + err("ftruncate"); + if (fallocate(mem_fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, + nr_pages * page_size * 2)) + err("fallocate"); + } + printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", + nr_pages, nr_pages_per_cpu); + return userfaultfd_stress(); +} + +#else /* __NR_userfaultfd */ + +#warning "missing __NR_userfaultfd definition" + +int main(void) +{ + printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); + return KSFT_SKIP; +} + +#endif /* __NR_userfaultfd */ --- /dev/null +++ a/tools/testing/selftests/mm/util.h @@ -0,0 +1,69 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __KSELFTEST_VM_UTIL_H +#define __KSELFTEST_VM_UTIL_H + +#include <stdint.h> +#include <sys/mman.h> +#include <err.h> +#include <string.h> /* ffsl() */ +#include <unistd.h> /* _SC_PAGESIZE */ + +static unsigned int __page_size; +static unsigned int __page_shift; + +static inline unsigned int page_size(void) +{ + if (!__page_size) + __page_size = sysconf(_SC_PAGESIZE); + return __page_size; +} + +static inline unsigned int page_shift(void) +{ + if (!__page_shift) + __page_shift = (ffsl(page_size()) - 1); + return __page_shift; +} + +#define PAGE_SHIFT (page_shift()) +#define PAGE_SIZE (page_size()) +/* + * On ppc64 this will only work with radix 2M hugepage size + */ +#define HPAGE_SHIFT 21 +#define HPAGE_SIZE (1 << HPAGE_SHIFT) + +#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) +#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) + + +static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) +{ + uint64_t ent[2]; + + /* drop pmd */ + if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANONYMOUS | + MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) + errx(2, "mmap transhuge"); + + if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) + err(2, "MADV_HUGEPAGE"); + + /* allocate transparent huge page */ + *(volatile void **)ptr = ptr; + + if (pread(pagemap_fd, ent, sizeof(ent), + (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) + err(2, "read pagemap"); + + if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && + PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && + !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) + return PAGEMAP_PFN(ent[0]); + + return -1; +} + +#endif --- /dev/null +++ a/tools/testing/selftests/mm/va_128TBswitch.c @@ -0,0 +1,289 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * + * Authors: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> + * Authors: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> + */ + +#include <stdio.h> +#include <sys/mman.h> +#include <string.h> + +#include "../kselftest.h" + +#ifdef __powerpc64__ +#define PAGE_SIZE (64 << 10) +/* + * This will work with 16M and 2M hugepage size + */ +#define HUGETLB_SIZE (16 << 20) +#else +#define PAGE_SIZE (4 << 10) +#define HUGETLB_SIZE (2 << 20) +#endif + +/* + * >= 128TB is the hint addr value we used to select + * large address space. 
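A note on util.h's allocate_transhuge() above: it declares success only when the first two pagemap entries of the candidate area show present, physically contiguous frames starting on a huge-page-aligned PFN. A hypothetical caller, assuming an HPAGE_SIZE-aligned slot carved out of a larger reservation (sketch only, not part of the patch):

#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include "util.h"

int main(void)
{
        int fd = open("/proc/self/pagemap", O_RDONLY);
        /* Over-reserve so an HPAGE_SIZE-aligned slot exists inside. */
        void *area = mmap(NULL, 2 * HPAGE_SIZE, PROT_NONE,
                          MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        uintptr_t slot;

        if (fd < 0 || area == MAP_FAILED)
                err(1, "setup");
        slot = ((uintptr_t)area + HPAGE_SIZE - 1) & ~((uintptr_t)HPAGE_SIZE - 1);
        if (allocate_transhuge((void *)slot, fd) < 0)
                warnx("no huge page (THP disabled or memory fragmented?)");
        return 0;
}

The over-reservation exists only to guarantee alignment; allocate_transhuge() re-mmaps the slot with MAP_FIXED itself.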
+ */ +#define ADDR_SWITCH_HINT (1UL << 47) +#define LOW_ADDR ((void *) (1UL << 30)) +#define HIGH_ADDR ((void *) (1UL << 48)) + +struct testcase { + void *addr; + unsigned long size; + unsigned long flags; + const char *msg; + unsigned int low_addr_required:1; + unsigned int keep_mapped:1; +}; + +static struct testcase testcases[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + /* + * We should never allocate at the requested address or above it + * The len cross the 128TB boundary. Without MAP_FIXED + * we will always search in the lower address space. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at 128TB, the area is free we should get that + * even without MAP_FIXED. + */ + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, +}; + +static struct testcase 
hugetlb_testcases[] = { + { + .addr = NULL, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", + }, +}; + +static int run_test(struct testcase *test, int count) +{ + void *p; + int i, ret = KSFT_PASS; + + for (i = 0; i < count; i++) { + struct testcase *t = test + i; + + p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); + + printf("%s: %p - ", t->msg, p); + + if (p == MAP_FAILED) { + printf("FAILED\n"); + ret = KSFT_FAIL; + continue; + } + + if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + printf("FAILED\n"); + ret = KSFT_FAIL; + } else { + /* + * Do a dereference of the address returned so that we catch + * bugs in page fault handling + */ + memset(p, 0, t->size); + printf("OK\n"); + } + if (!t->keep_mapped) + munmap(p, t->size); + } + + return ret; +} + +static int supported_arch(void) +{ +#if defined(__powerpc64__) + return 1; +#elif defined(__x86_64__) + return 1; +#else + return 0; +#endif +} + +int main(int argc, char **argv) +{ + int ret; + + if (!supported_arch()) + return KSFT_SKIP; + + ret = run_test(testcases, ARRAY_SIZE(testcases)); + if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) + ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + return ret; +} --- /dev/null +++ a/tools/testing/selftests/mm/va_128TBswitch.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (C) 2022 Adam Sindelar (Meta) <adam@xxxxxxxxxxxx> +# +# This is a test for mmap behavior with 5-level paging. This script wraps the +# real test to check that the kernel is configured to support at least 5 +# pagetable levels. + +# 1 means the test failed +exitcode=1 + +# Kselftest framework requirement - SKIP code is 4. 
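For context on the boundary these testcases probe: on x86_64 with 5-level page tables, mmap() keeps returning addresses below the 128TB switch point unless the hint itself lies above it. A standalone probe of that behavior might look like the following (sketch, not part of the patch):

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
        void *hint = (void *)(1UL << 48);       /* above the 128TB mark */
        void *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        printf("%p lies %s the switch point\n", p,
               (unsigned long)p >= (1UL << 47) ? "above" : "below");
        return 0;
}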
+ksft_skip=4 + +fail() +{ + echo "$1" + exit $exitcode +} + +check_supported_x86_64() +{ + local config="/proc/config.gz" + [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" + [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" + + # gzip -dcfq automatically handles both compressed and plaintext input. + # See man 1 gzip under '-f'. + local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) + + if [[ "${pg_table_levels}" -lt 5 ]]; then + echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" + exit $ksft_skip + fi +} + +check_test_requirements() +{ + # The test supports x86_64 and powerpc64. We currently have no useful + # eligibility check for powerpc64, and the test itself will reject other + # architectures. + case `uname -m` in + "x86_64") + check_supported_x86_64 + ;; + *) + return 0 + ;; + esac +} + +check_test_requirements +./va_128TBswitch --- /dev/null +++ a/tools/testing/selftests/mm/virtual_address_range.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright 2017, Anshuman Khandual, IBM Corp. + * + * Works on architectures which support 128TB virtual + * address range and beyond. + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/mman.h> +#include <sys/time.h> + +/* + * Maximum address range mapped with a single mmap() + * call is little bit more than 16GB. Hence 16GB is + * chosen as the single chunk size for address space + * mapping. + */ +#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */ + +/* + * Address space till 128TB is mapped without any hint + * and is enabled by default. Address space beyond 128TB + * till 512TB is obtained by passing hint address as the + * first argument into mmap() system call. + * + * The process heap address space is divided into two + * different areas one below 128TB and one above 128TB + * till it reaches 512TB. One with size 128TB and the + * other being 384TB. + * + * On Arm64 the address space is 256TB and no high mappings + * are supported so far. 
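(Arithmetic aside: 128TB / 16GB = 2^47 / 2^34 = 8192, which is where NR_CHUNKS_128TB just below comes from; the 384TB high area likewise yields 3 * 8192 chunks.)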
+ */ + +#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */ +#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) +#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) + +#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ +#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ + +#ifdef __aarch64__ +#define HIGH_ADDR_MARK ADDR_MARK_256TB +#define HIGH_ADDR_SHIFT 49 +#define NR_CHUNKS_LOW NR_CHUNKS_256TB +#define NR_CHUNKS_HIGH 0 +#else +#define HIGH_ADDR_MARK ADDR_MARK_128TB +#define HIGH_ADDR_SHIFT 48 +#define NR_CHUNKS_LOW NR_CHUNKS_128TB +#define NR_CHUNKS_HIGH NR_CHUNKS_384TB +#endif + +static char *hind_addr(void) +{ + int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); + + return (char *) (1UL << bits); +} + +static int validate_addr(char *ptr, int high_addr) +{ + unsigned long addr = (unsigned long) ptr; + + if (high_addr) { + if (addr < HIGH_ADDR_MARK) { + printf("Bad address %lx\n", addr); + return 1; + } + return 0; + } + + if (addr > HIGH_ADDR_MARK) { + printf("Bad address %lx\n", addr); + return 1; + } + return 0; +} + +static int validate_lower_address_hint(void) +{ + char *ptr; + + ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | + PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr == MAP_FAILED) + return 0; + + return 1; +} + +int main(int argc, char *argv[]) +{ + char *ptr[NR_CHUNKS_LOW]; + char *hptr[NR_CHUNKS_HIGH]; + char *hint; + unsigned long i, lchunks, hchunks; + + for (i = 0; i < NR_CHUNKS_LOW; i++) { + ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr[i] == MAP_FAILED) { + if (validate_lower_address_hint()) + return 1; + break; + } + + if (validate_addr(ptr[i], 0)) + return 1; + } + lchunks = i; + + for (i = 0; i < NR_CHUNKS_HIGH; i++) { + hint = hind_addr(); + hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (hptr[i] == MAP_FAILED) + break; + + if (validate_addr(hptr[i], 1)) + return 1; + } + hchunks = i; + + for (i = 0; i < lchunks; i++) + munmap(ptr[i], MAP_CHUNK_SIZE); + + for (i = 0; i < hchunks; i++) + munmap(hptr[i], MAP_CHUNK_SIZE); + + return 0; +} --- /dev/null +++ a/tools/testing/selftests/mm/vm_util.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <string.h> +#include <fcntl.h> +#include "../kselftest.h" +#include "vm_util.h" + +#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define SMAP_FILE_PATH "/proc/self/smaps" +#define MAX_LINE_LENGTH 500 + +uint64_t pagemap_get_entry(int fd, char *start) +{ + const unsigned long pfn = (unsigned long)start / getpagesize(); + uint64_t entry; + int ret; + + ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); + if (ret != sizeof(entry)) + ksft_exit_fail_msg("reading pagemap failed\n"); + return entry; +} + +bool pagemap_is_softdirty(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + // Check if dirty bit (55th bit) is set + return entry & 0x0080000000000000ull; +} + +bool pagemap_is_swapped(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + return entry & 0x4000000000000000ull; +} + +bool pagemap_is_populated(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* Present or swapped. */ + return entry & 0xc000000000000000ull; +} + +unsigned long pagemap_get_pfn(int fd, char *start) +{ + uint64_t entry = pagemap_get_entry(fd, start); + + /* If present (63th bit), PFN is at bit 0 -- 54. 
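The magic constants used by these vm_util.c helpers decode as follows, per the kernel's pagemap documentation (the macro names are illustrative, not from the patch):

#define PM_PRESENT      (1ULL << 63)            /* page present in RAM */
#define PM_SWAP         (1ULL << 62)            /* page swapped out */
#define PM_SOFT_DIRTY   (1ULL << 55)            /* soft-dirty tracking bit */
#define PM_PFN_MASK     ((1ULL << 55) - 1)      /* bits 0-54 hold the PFN */

So 0xc000000000000000ull in pagemap_is_populated() is simply PM_PRESENT | PM_SWAP, and the bit tested by pagemap_is_softdirty() is the soft-dirty tracking bit rather than a hardware dirty bit.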
*/ + if (entry & 0x8000000000000000ull) + return entry & 0x007fffffffffffffull; + return -1ul; +} + +void clear_softdirty(void) +{ + int ret; + const char *ctrl = "4"; + int fd = open("/proc/self/clear_refs", O_WRONLY); + + if (fd < 0) + ksft_exit_fail_msg("opening clear_refs failed\n"); + ret = write(fd, ctrl, strlen(ctrl)); + close(fd); + if (ret != strlen(ctrl)) + ksft_exit_fail_msg("writing clear_refs failed\n"); +} + +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) +{ + while (fgets(buf, len, fp)) { + if (!strncmp(buf, pattern, strlen(pattern))) + return true; + } + return false; +} + +uint64_t read_pmd_pagesize(void) +{ + int fd; + char buf[20]; + ssize_t num_read; + + fd = open(PMD_SIZE_FILE_PATH, O_RDONLY); + if (fd == -1) + ksft_exit_fail_msg("Open hpage_pmd_size failed\n"); + + num_read = read(fd, buf, 19); + if (num_read < 1) { + close(fd); + ksft_exit_fail_msg("Read hpage_pmd_size failed\n"); + } + buf[num_read] = '\0'; + close(fd); + + return strtoul(buf, NULL, 10); +} + +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) +{ + uint64_t thp = -1; + int ret; + FILE *fp; + char buffer[MAX_LINE_LENGTH]; + char addr_pattern[MAX_LINE_LENGTH]; + + ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", + (unsigned long) addr); + if (ret >= MAX_LINE_LENGTH) + ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); + + fp = fopen(SMAP_FILE_PATH, "r"); + if (!fp) + ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); + + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) + goto err_out; + + /* + * Fetch the pattern in the same block and check the number of + * hugepages. + */ + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) + goto err_out; + + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) + ksft_exit_fail_msg("Reading smap error\n"); + +err_out: + fclose(fp); + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); +} + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); +} --- /dev/null +++ a/tools/testing/selftests/mm/vm_util.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <stdint.h> +#include <stdbool.h> + +uint64_t pagemap_get_entry(int fd, char *start); +bool pagemap_is_softdirty(int fd, char *start); +bool pagemap_is_swapped(int fd, char *start); +bool pagemap_is_populated(int fd, char *start); +unsigned long pagemap_get_pfn(int fd, char *start); +void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); +uint64_t read_pmd_pagesize(void); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); --- /dev/null +++ a/tools/testing/selftests/mm/write_hugetlb_memory.sh @@ -0,0 +1,23 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +set -e + +size=$1 +populate=$2 +write=$3 +cgroup=$4 +path=$5 +method=$6 +private=$7 +want_sleep=$8 +reserve=$9 + +echo "Putting task in cgroup '$cgroup'" +echo $$ > 
${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs + +echo "Method is $method" + +set +e +./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \ + "$private" "$want_sleep" "$reserve" --- /dev/null +++ a/tools/testing/selftests/mm/write_to_hugetlbfs.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * This program reserves and uses hugetlb memory, supporting the + * scenarios needed by the charge_reserved_hugetlb.sh test. + */ + +#include <err.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/stat.h> +#include <sys/mman.h> + +/* Global definitions. */ +enum method { + HUGETLBFS, + MMAP_MAP_HUGETLB, + SHM, + MAX_METHOD +}; + + +/* Global variables. */ +static const char *self; +static char *shmaddr; +static int shmid; + +/* + * Show usage and exit. + */ +static void exit_usage(void) +{ + printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> " + "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] " + "[-o] [-w] [-n]\n", + self); + exit(EXIT_FAILURE); +} + +void sig_handler(int signo) +{ + printf("Received %d.\n", signo); + if (signo == SIGINT) { + printf("Deleting the memory\n"); + if (shmdt((const void *)shmaddr) != 0) { + perror("Detach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(4); + } + + shmctl(shmid, IPC_RMID, NULL); + printf("Done deleting the memory\n"); + } + exit(2); +} + +int main(int argc, char **argv) +{ + int fd = 0; + int key = 0; + int *ptr = NULL; + int c = 0; + int size = 0; + char path[256] = ""; + enum method method = MAX_METHOD; + int want_sleep = 0, private = 0; + int populate = 0; + int write = 0; + int reserve = 1; + + if (signal(SIGINT, sig_handler) == SIG_ERR) + err(1, "\ncan't catch SIGINT\n"); + + /* Parse command-line arguments.
*/ + setvbuf(stdout, NULL, _IONBF, 0); + self = argv[0]; + + while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { + switch (c) { + case 's': + size = atoi(optarg); + break; + case 'p': + strncpy(path, optarg, sizeof(path)); + break; + case 'm': + if (atoi(optarg) >= MAX_METHOD) { + errno = EINVAL; + perror("Invalid -m."); + exit_usage(); + } + method = atoi(optarg); + break; + case 'o': + populate = 1; + break; + case 'w': + write = 1; + break; + case 'l': + want_sleep = 1; + break; + case 'r': + private + = 1; + break; + case 'n': + reserve = 0; + break; + default: + errno = EINVAL; + perror("Invalid arg"); + exit_usage(); + } + } + + if (strncmp(path, "", sizeof(path)) != 0) { + printf("Writing to this path: %s\n", path); + } else { + errno = EINVAL; + perror("path not found"); + exit_usage(); + } + + if (size != 0) { + printf("Writing this size: %d\n", size); + } else { + errno = EINVAL; + perror("size not found"); + exit_usage(); + } + + if (!populate) + printf("Not populating.\n"); + else + printf("Populating.\n"); + + if (!write) + printf("Not writing to memory.\n"); + + if (method == MAX_METHOD) { + errno = EINVAL; + perror("-m Invalid"); + exit_usage(); + } else + printf("Using method=%d\n", method); + + if (!private) + printf("Shared mapping.\n"); + else + printf("Private mapping.\n"); + + if (!reserve) + printf("NO_RESERVE mapping.\n"); + else + printf("RESERVE mapping.\n"); + + switch (method) { + case HUGETLBFS: + printf("Allocating using HUGETLBFS.\n"); + fd = open(path, O_CREAT | O_RDWR, 0777); + if (fd == -1) + err(1, "Failed to open file."); + + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + (private ? MAP_PRIVATE : MAP_SHARED) | + (populate ? MAP_POPULATE : 0) | + (reserve ? 0 : MAP_NORESERVE), + fd, 0); + + if (ptr == MAP_FAILED) { + close(fd); + err(1, "Error mapping the file"); + } + break; + case MMAP_MAP_HUGETLB: + printf("Allocating using MAP_HUGETLB.\n"); + ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, + (private ? (MAP_PRIVATE | MAP_ANONYMOUS) : + MAP_SHARED) | + MAP_HUGETLB | (populate ? MAP_POPULATE : 0) | + (reserve ? 0 : MAP_NORESERVE), + -1, 0); + + if (ptr == MAP_FAILED) + err(1, "mmap"); + + printf("Returned address is %p\n", ptr); + break; + case SHM: + printf("Allocating using SHM.\n"); + shmid = shmget(key, size, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) { + shmid = shmget(++key, size, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (shmid < 0) + err(1, "shmget"); + } + printf("shmid: 0x%x, shmget key:%d\n", shmid, key); + + ptr = shmat(shmid, NULL, 0); + if (ptr == (int *)-1) { + perror("Shared memory attach failure"); + shmctl(shmid, IPC_RMID, NULL); + exit(2); + } + printf("shmaddr: %p\n", ptr); + + break; + default: + errno = EINVAL; + err(1, "Invalid method."); + } + + if (write) { + printf("Writing to memory.\n"); + memset(ptr, 1, size); + } + + if (want_sleep) { + /* Signal to caller that we're done. */ + printf("DONE\n"); + + /* Hold memory until external kill signal is delivered. */ + while (1) + sleep(100); + } + + if (method == HUGETLBFS) + close(fd); + + return 0; +} --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ /dev/null @@ -1,584 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -set -e - -if [[ $(id -u) -ne 0 ]]; then - echo "This test must be run as root. Skipping..." 
- exit $ksft_skip -fi - -fault_limit_file=limit_in_bytes -reservation_limit_file=rsvd.limit_in_bytes -fault_usage_file=usage_in_bytes -reservation_usage_file=rsvd.usage_in_bytes - -if [[ "$1" == "-cgroup-v2" ]]; then - cgroup2=1 - fault_limit_file=max - reservation_limit_file=rsvd.max - fault_usage_file=current - reservation_usage_file=rsvd.current -fi - -if [[ $cgroup2 ]]; then - cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') - if [[ -z "$cgroup_path" ]]; then - cgroup_path=/dev/cgroup/memory - mount -t cgroup2 none $cgroup_path - do_umount=1 - fi - echo "+hugetlb" >$cgroup_path/cgroup.subtree_control -else - cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') - if [[ -z "$cgroup_path" ]]; then - cgroup_path=/dev/cgroup/memory - mount -t cgroup memory,hugetlb $cgroup_path - do_umount=1 - fi -fi -export cgroup_path - -function cleanup() { - if [[ $cgroup2 ]]; then - echo $$ >$cgroup_path/cgroup.procs - else - echo $$ >$cgroup_path/tasks - fi - - if [[ -e /mnt/huge ]]; then - rm -rf /mnt/huge/* - umount /mnt/huge || echo error - rmdir /mnt/huge - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test1 ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test1 - fi - if [[ -e $cgroup_path/hugetlb_cgroup_test2 ]]; then - rmdir $cgroup_path/hugetlb_cgroup_test2 - fi - echo 0 >/proc/sys/vm/nr_hugepages - echo CLEANUP DONE -} - -function expect_equal() { - local expected="$1" - local actual="$2" - local error="$3" - - if [[ "$expected" != "$actual" ]]; then - echo "expected ($expected) != actual ($actual): $3" - cleanup - exit 1 - fi -} - -function get_machine_hugepage_size() { - hpz=$(grep -i hugepagesize /proc/meminfo) - kb=${hpz:14:-3} - mb=$(($kb / 1024)) - echo $mb -} - -MB=$(get_machine_hugepage_size) - -function setup_cgroup() { - local name="$1" - local cgroup_limit="$2" - local reservation_limit="$3" - - mkdir $cgroup_path/$name - - echo writing cgroup limit: "$cgroup_limit" - echo "$cgroup_limit" >$cgroup_path/$name/hugetlb.${MB}MB.$fault_limit_file - - echo writing reseravation limit: "$reservation_limit" - echo "$reservation_limit" > \ - $cgroup_path/$name/hugetlb.${MB}MB.$reservation_limit_file - - if [ -e "$cgroup_path/$name/cpuset.cpus" ]; then - echo 0 >$cgroup_path/$name/cpuset.cpus - fi - if [ -e "$cgroup_path/$name/cpuset.mems" ]; then - echo 0 >$cgroup_path/$name/cpuset.mems - fi -} - -function wait_for_hugetlb_memory_to_get_depleted() { - local cgroup="$1" - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get depleted. - while [ $(cat $path) != 0 ]; do - echo Waiting for hugetlb memory to get depleted. - cat $path - sleep 0.5 - done -} - -function wait_for_hugetlb_memory_to_get_reserved() { - local cgroup="$1" - local size="$2" - - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory reservation to reach size $size. - cat $path - sleep 0.5 - done -} - -function wait_for_hugetlb_memory_to_get_written() { - local cgroup="$1" - local size="$2" - - local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" - # Wait for hugetlbfs memory to get written. - while [ $(cat $path) != $size ]; do - echo Waiting for hugetlb memory to reach size $size. 
- cat $path - sleep 0.5 - done -} - -function write_hugetlbfs_and_get_usage() { - local cgroup="$1" - local size="$2" - local populate="$3" - local write="$4" - local path="$5" - local method="$6" - local private="$7" - local expect_failure="$8" - local reserve="$9" - - # Function return values. - reservation_failed=0 - oom_killed=0 - hugetlb_difference=0 - reserved_difference=0 - - local hugetlb_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file - local reserved_usage=$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file - - local hugetlb_before=$(cat $hugetlb_usage) - local reserved_before=$(cat $reserved_usage) - - echo - echo Starting: - echo hugetlb_usage="$hugetlb_before" - echo reserved_usage="$reserved_before" - echo expect_failure is "$expect_failure" - - output=$(mktemp) - set +e - if [[ "$method" == "1" ]] || [[ "$method" == 2 ]] || - [[ "$private" == "-r" ]] && [[ "$expect_failure" != 1 ]]; then - - bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ - "$cgroup" "$path" "$method" "$private" "-l" "$reserve" 2>&1 | tee $output & - - local write_result=$? - local write_pid=$! - - until grep -q -i "DONE" $output; do - echo waiting for DONE signal. - if ! ps $write_pid > /dev/null - then - echo "FAIL: The write died" - cleanup - exit 1 - fi - sleep 0.5 - done - - echo ================= write_hugetlb_memory.sh output is: - cat $output - echo ================= end output. - - if [[ "$populate" == "-o" ]] || [[ "$write" == "-w" ]]; then - wait_for_hugetlb_memory_to_get_written "$cgroup" "$size" - elif [[ "$reserve" != "-n" ]]; then - wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" - else - # This case doesn't produce visible effects, but we still have - # to wait for the async process to start and execute... - sleep 0.5 - fi - - echo write_result is $write_result - else - bash write_hugetlb_memory.sh "$size" "$populate" "$write" \ - "$cgroup" "$path" "$method" "$private" "$reserve" - local write_result=$? - - if [[ "$reserve" != "-n" ]]; then - wait_for_hugetlb_memory_to_get_reserved "$cgroup" "$size" - fi - fi - set -e - - if [[ "$write_result" == 1 ]]; then - reservation_failed=1 - fi - - # On linus/master, the above process gets SIGBUS'd on oomkill, with - # return code 135. On earlier kernels, it gets actual oomkill, with return - # code 137, so just check for both conditions in case we're testing - # against an earlier kernel. - if [[ "$write_result" == 135 ]] || [[ "$write_result" == 137 ]]; then - oom_killed=1 - fi - - local hugetlb_after=$(cat $hugetlb_usage) - local reserved_after=$(cat $reserved_usage) - - echo After write: - echo hugetlb_usage="$hugetlb_after" - echo reserved_usage="$reserved_after" - - hugetlb_difference=$(($hugetlb_after - $hugetlb_before)) - reserved_difference=$(($reserved_after - $reserved_before)) -} - -function cleanup_hugetlb_memory() { - set +e - local cgroup="$1" - if [[ "$(pgrep -f write_to_hugetlbfs)" != "" ]]; then - echo killing write_to_hugetlbfs - killall -2 write_to_hugetlbfs - wait_for_hugetlb_memory_to_get_depleted $cgroup - fi - set -e - - if [[ -e /mnt/huge ]]; then - rm -rf /mnt/huge/* - umount /mnt/huge - rmdir /mnt/huge - fi -} - -function run_test() { - local size=$(($1 * ${MB} * 1024 * 1024)) - local populate="$2" - local write="$3" - local cgroup_limit=$(($4 * ${MB} * 1024 * 1024)) - local reservation_limit=$(($5 * ${MB} * 1024 * 1024)) - local nr_hugepages="$6" - local method="$7" - local private="$8" - local expect_failure="$9" - local reserve="${10}" - - # Function return values. 
- hugetlb_difference=0 - reserved_difference=0 - reservation_failed=0 - oom_killed=0 - - echo nr hugepages = "$nr_hugepages" - echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages - - setup_cgroup "hugetlb_cgroup_test" "$cgroup_limit" "$reservation_limit" - - mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test" "$size" "$populate" \ - "$write" "/mnt/huge/test" "$method" "$private" "$expect_failure" \ - "$reserve" - - cleanup_hugetlb_memory "hugetlb_cgroup_test" - - local final_hugetlb=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$fault_usage_file) - local final_reservation=$(cat $cgroup_path/hugetlb_cgroup_test/hugetlb.${MB}MB.$reservation_usage_file) - - echo $hugetlb_difference - echo $reserved_difference - expect_equal "0" "$final_hugetlb" "final hugetlb is not zero" - expect_equal "0" "$final_reservation" "final reservation is not zero" -} - -function run_multiple_cgroup_test() { - local size1="$1" - local populate1="$2" - local write1="$3" - local cgroup_limit1="$4" - local reservation_limit1="$5" - - local size2="$6" - local populate2="$7" - local write2="$8" - local cgroup_limit2="$9" - local reservation_limit2="${10}" - - local nr_hugepages="${11}" - local method="${12}" - local private="${13}" - local expect_failure="${14}" - local reserve="${15}" - - # Function return values. - hugetlb_difference1=0 - reserved_difference1=0 - reservation_failed1=0 - oom_killed1=0 - - hugetlb_difference2=0 - reserved_difference2=0 - reservation_failed2=0 - oom_killed2=0 - - echo nr hugepages = "$nr_hugepages" - echo "$nr_hugepages" >/proc/sys/vm/nr_hugepages - - setup_cgroup "hugetlb_cgroup_test1" "$cgroup_limit1" "$reservation_limit1" - setup_cgroup "hugetlb_cgroup_test2" "$cgroup_limit2" "$reservation_limit2" - - mkdir -p /mnt/huge - mount -t hugetlbfs -o pagesize=${MB}M,size=256M none /mnt/huge - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test1" "$size1" \ - "$populate1" "$write1" "/mnt/huge/test1" "$method" "$private" \ - "$expect_failure" "$reserve" - - hugetlb_difference1=$hugetlb_difference - reserved_difference1=$reserved_difference - reservation_failed1=$reservation_failed - oom_killed1=$oom_killed - - local cgroup1_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$fault_usage_file - local cgroup1_reservation_usage=$cgroup_path/hugetlb_cgroup_test1/hugetlb.${MB}MB.$reservation_usage_file - local cgroup2_hugetlb_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$fault_usage_file - local cgroup2_reservation_usage=$cgroup_path/hugetlb_cgroup_test2/hugetlb.${MB}MB.$reservation_usage_file - - local usage_before_second_write=$(cat $cgroup1_hugetlb_usage) - local reservation_usage_before_second_write=$(cat $cgroup1_reservation_usage) - - write_hugetlbfs_and_get_usage "hugetlb_cgroup_test2" "$size2" \ - "$populate2" "$write2" "/mnt/huge/test2" "$method" "$private" \ - "$expect_failure" "$reserve" - - hugetlb_difference2=$hugetlb_difference - reserved_difference2=$reserved_difference - reservation_failed2=$reservation_failed - oom_killed2=$oom_killed - - expect_equal "$usage_before_second_write" \ - "$(cat $cgroup1_hugetlb_usage)" "Usage changed." - expect_equal "$reservation_usage_before_second_write" \ - "$(cat $cgroup1_reservation_usage)" "Reservation usage changed." 
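These helpers poll per-cgroup counter files such as hugetlb.${MB}MB.rsvd.usage_in_bytes. Read back in C, one of those counters might look like this (sketch only; the function name and the "2MB" component, which follows the machine's default huge page size, are illustrative):

#include <stdio.h>

static long read_rsvd_usage(const char *cgroup_dir)
{
        char path[512];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path),
                 "%s/hugetlb.2MB.rsvd.usage_in_bytes", cgroup_dir);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}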
- - cleanup_hugetlb_memory - - local final_hugetlb=$(cat $cgroup1_hugetlb_usage) - local final_reservation=$(cat $cgroup1_reservation_usage) - - expect_equal "0" "$final_hugetlb" \ - "hugetlbt_cgroup_test1 final hugetlb is not zero" - expect_equal "0" "$final_reservation" \ - "hugetlbt_cgroup_test1 final reservation is not zero" - - local final_hugetlb=$(cat $cgroup2_hugetlb_usage) - local final_reservation=$(cat $cgroup2_reservation_usage) - - expect_equal "0" "$final_hugetlb" \ - "hugetlb_cgroup_test2 final hugetlb is not zero" - expect_equal "0" "$final_reservation" \ - "hugetlb_cgroup_test2 final reservation is not zero" -} - -cleanup - -for populate in "" "-o"; do - for method in 0 1 2; do - for private in "" "-r"; do - for reserve in "" "-n"; do - - # Skip mmap(MAP_HUGETLB | MAP_SHARED). Doesn't seem to be supported. - if [[ "$method" == 1 ]] && [[ "$private" == "" ]]; then - continue - fi - - # Skip populated shmem tests. Doesn't seem to be supported. - if [[ "$method" == 2"" ]] && [[ "$populate" == "-o" ]]; then - continue - fi - - if [[ "$method" == 2"" ]] && [[ "$reserve" == "-n" ]]; then - continue - fi - - cleanup - echo - echo - echo - echo Test normal case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_test 5 "$populate" "" 10 10 10 "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb=$hugetlb_difference - echo Memory charged to reservation=$reserved_difference - - if [[ "$populate" == "-o" ]]; then - expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." - else - expect_equal "0" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." - fi - - if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then - expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - else - expect_equal "0" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - fi - - echo 'PASS' - - cleanup - echo - echo - echo - echo Test normal case with write. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_test 5 "$populate" '-w' 5 5 10 "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb=$hugetlb_difference - echo Memory charged to reservation=$reserved_difference - - expect_equal "$((5 * $MB * 1024 * 1024))" "$hugetlb_difference" \ - "Reserved memory charged to hugetlb cgroup." - - expect_equal "$((5 * $MB * 1024 * 1024))" "$reserved_difference" \ - "Reserved memory not charged to reservation usage." - - echo 'PASS' - - cleanup - continue - echo - echo - echo - echo Test more than reservation case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - - if [ "$reserve" != "-n" ]; then - run_test "5" "$populate" '' "10" "2" "10" "$method" "$private" "1" \ - "$reserve" - - expect_equal "1" "$reservation_failed" "Reservation succeeded." - fi - - echo 'PASS' - - cleanup - - echo - echo - echo - echo Test more than cgroup limit case. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - - # Not sure if shm memory can be cleaned up when the process gets sigbus'd. - if [[ "$method" != 2 ]]; then - run_test 5 "$populate" "-w" 2 10 10 "$method" "$private" "1" "$reserve" - - expect_equal "1" "$oom_killed" "Not oom killed." - fi - echo 'PASS' - - cleanup - - echo - echo - echo - echo Test normal case, multiple cgroups. 
- echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_multiple_cgroup_test "3" "$populate" "" "10" "10" "5" \ - "$populate" "" "10" "10" "10" \ - "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb1=$hugetlb_difference1 - echo Memory charged to reservation1=$reserved_difference1 - echo Memory charged to hugtlb2=$hugetlb_difference2 - echo Memory charged to reservation2=$reserved_difference2 - - if [[ "$reserve" != "-n" ]] || [[ "$populate" == "-o" ]]; then - expect_equal "3" "$reserved_difference1" \ - "Incorrect reservations charged to cgroup 1." - - expect_equal "5" "$reserved_difference2" \ - "Incorrect reservation charged to cgroup 2." - - else - expect_equal "0" "$reserved_difference1" \ - "Incorrect reservations charged to cgroup 1." - - expect_equal "0" "$reserved_difference2" \ - "Incorrect reservation charged to cgroup 2." - fi - - if [[ "$populate" == "-o" ]]; then - expect_equal "3" "$hugetlb_difference1" \ - "Incorrect hugetlb charged to cgroup 1." - - expect_equal "5" "$hugetlb_difference2" \ - "Incorrect hugetlb charged to cgroup 2." - - else - expect_equal "0" "$hugetlb_difference1" \ - "Incorrect hugetlb charged to cgroup 1." - - expect_equal "0" "$hugetlb_difference2" \ - "Incorrect hugetlb charged to cgroup 2." - fi - echo 'PASS' - - cleanup - echo - echo - echo - echo Test normal case with write, multiple cgroups. - echo private=$private, populate=$populate, method=$method, reserve=$reserve - run_multiple_cgroup_test "3" "$populate" "-w" "10" "10" "5" \ - "$populate" "-w" "10" "10" "10" \ - "$method" "$private" "0" "$reserve" - - echo Memory charged to hugtlb1=$hugetlb_difference1 - echo Memory charged to reservation1=$reserved_difference1 - echo Memory charged to hugtlb2=$hugetlb_difference2 - echo Memory charged to reservation2=$reserved_difference2 - - expect_equal "3" "$hugetlb_difference1" \ - "Incorrect hugetlb charged to cgroup 1." - - expect_equal "3" "$reserved_difference1" \ - "Incorrect reservation charged to cgroup 1." - - expect_equal "5" "$hugetlb_difference2" \ - "Incorrect hugetlb charged to cgroup 2." - - expect_equal "5" "$reserved_difference2" \ - "Incorrected reservation charged to cgroup 2." - echo 'PASS' - - cleanup - - done # reserve - done # private - done # populate -done # method - -if [[ $do_umount ]]; then - umount $cgroup_path - rmdir $cgroup_path -fi --- a/tools/testing/selftests/vm/check_config.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 -# -# Probe for libraries and create header files to record the results. Both C -# header files and Makefile include fragments are created. 
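The probe below works by test-compiling a trivial translation unit against liburing and treating compiler success as the feature signal; the generated source is effectively:

#include <sys/types.h>
#include <liburing.h>

int func(void) { return 0; }

Its results land in local_config.h and local_config.mk, whose COW_EXTRA_LIBS setting the Makefile folds into the cow test's link flags.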
- -OUTPUT_H_FILE=local_config.h -OUTPUT_MKFILE=local_config.mk - -tmpname=$(mktemp) -tmpfile_c=${tmpname}.c -tmpfile_o=${tmpname}.o - -# liburing -echo "#include <sys/types.h>" > $tmpfile_c -echo "#include <liburing.h>" >> $tmpfile_c -echo "int func(void) { return 0; }" >> $tmpfile_c - -CC=${1:?"Usage: $0 <compiler> # example compiler: gcc"} -$CC -c $tmpfile_c -o $tmpfile_o >/dev/null 2>&1 - -if [ -f $tmpfile_o ]; then - echo "#define LOCAL_CONFIG_HAVE_LIBURING 1" > $OUTPUT_H_FILE - echo "COW_EXTRA_LIBS = -luring" > $OUTPUT_MKFILE -else - echo "// No liburing support found" > $OUTPUT_H_FILE - echo "# No liburing support found, so:" > $OUTPUT_MKFILE - echo "COW_EXTRA_LIBS = " >> $OUTPUT_MKFILE -fi - -rm ${tmpname}.* --- a/tools/testing/selftests/vm/compaction_test.c +++ /dev/null @@ -1,231 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * - * A test for the patch "Allow compaction of unevictable pages". - * With this patch we should be able to allocate at least 1/4 - * of RAM in huge pages. Without the patch much less is - * allocated. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <sys/mman.h> -#include <sys/resource.h> -#include <fcntl.h> -#include <errno.h> -#include <unistd.h> -#include <string.h> - -#include "../kselftest.h" - -#define MAP_SIZE_MB 100 -#define MAP_SIZE (MAP_SIZE_MB * 1024 * 1024) - -struct map_list { - void *map; - struct map_list *next; -}; - -int read_memory_info(unsigned long *memfree, unsigned long *hugepagesize) -{ - char buffer[256] = {0}; - char *cmd = "cat /proc/meminfo | grep -i memfree | grep -o '[0-9]*'"; - FILE *cmdfile = popen(cmd, "r"); - - if (!(fgets(buffer, sizeof(buffer), cmdfile))) { - perror("Failed to read meminfo\n"); - return -1; - } - - pclose(cmdfile); - - *memfree = atoll(buffer); - cmd = "cat /proc/meminfo | grep -i hugepagesize | grep -o '[0-9]*'"; - cmdfile = popen(cmd, "r"); - - if (!(fgets(buffer, sizeof(buffer), cmdfile))) { - perror("Failed to read meminfo\n"); - return -1; - } - - pclose(cmdfile); - *hugepagesize = atoll(buffer); - - return 0; -} - -int prereq(void) -{ - char allowed; - int fd; - - fd = open("/proc/sys/vm/compact_unevictable_allowed", - O_RDONLY | O_NONBLOCK); - if (fd < 0) { - perror("Failed to open\n" - "/proc/sys/vm/compact_unevictable_allowed\n"); - return -1; - } - - if (read(fd, &allowed, sizeof(char)) != sizeof(char)) { - perror("Failed to read from\n" - "/proc/sys/vm/compact_unevictable_allowed\n"); - close(fd); - return -1; - } - - close(fd); - if (allowed == '1') - return 0; - - return -1; -} - -int check_compaction(unsigned long mem_free, unsigned int hugepage_size) -{ - int fd; - int compaction_index = 0; - char initial_nr_hugepages[10] = {0}; - char nr_hugepages[10] = {0}; - - /* We want to test with 80% of available memory. Else, OOM killer comes - in to play */ - mem_free = mem_free * 0.8; - - fd = open("/proc/sys/vm/nr_hugepages", O_RDWR | O_NONBLOCK); - if (fd < 0) { - perror("Failed to open /proc/sys/vm/nr_hugepages"); - return -1; - } - - if (read(fd, initial_nr_hugepages, sizeof(initial_nr_hugepages)) <= 0) { - perror("Failed to read from /proc/sys/vm/nr_hugepages"); - goto close_fd; - } - - /* Start with the initial condition of 0 huge pages*/ - if (write(fd, "0", sizeof(char)) != sizeof(char)) { - perror("Failed to write 0 to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - /* Request a large number of huge pages. 
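(A worked example of the check this feeds: suppose 10 GiB were free, so the scaled mem_free is 8388608 KiB; with 2048 KiB huge pages, about 1365 allocated pages give compaction_index = 8388608 / (1365 * 2048) = 3 by integer division, the largest value that still passes. Fewer pages push the index above 3, meaning less than a third of the scaled memory became huge pages, and the test fails.)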
The Kernel will allocate - as much as it can */ - if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { - perror("Failed to write 100000 to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - lseek(fd, 0, SEEK_SET); - - if (read(fd, nr_hugepages, sizeof(nr_hugepages)) <= 0) { - perror("Failed to re-read from /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - /* We should have been able to request at least 1/3 rd of the memory in - huge pages */ - compaction_index = mem_free/(atoi(nr_hugepages) * hugepage_size); - - if (compaction_index > 3) { - printf("No of huge pages allocated = %d\n", - (atoi(nr_hugepages))); - fprintf(stderr, "ERROR: Less that 1/%d of memory is available\n" - "as huge pages\n", compaction_index); - goto close_fd; - } - - printf("No of huge pages allocated = %d\n", - (atoi(nr_hugepages))); - - lseek(fd, 0, SEEK_SET); - - if (write(fd, initial_nr_hugepages, strlen(initial_nr_hugepages)) - != strlen(initial_nr_hugepages)) { - perror("Failed to write value to /proc/sys/vm/nr_hugepages\n"); - goto close_fd; - } - - close(fd); - return 0; - - close_fd: - close(fd); - printf("Not OK. Compaction test failed."); - return -1; -} - - -int main(int argc, char **argv) -{ - struct rlimit lim; - struct map_list *list, *entry; - size_t page_size, i; - void *map = NULL; - unsigned long mem_free = 0; - unsigned long hugepage_size = 0; - long mem_fragmentable_MB = 0; - - if (prereq() != 0) { - printf("Either the sysctl compact_unevictable_allowed is not\n" - "set to 1 or couldn't read the proc file.\n" - "Skipping the test\n"); - return KSFT_SKIP; - } - - lim.rlim_cur = RLIM_INFINITY; - lim.rlim_max = RLIM_INFINITY; - if (setrlimit(RLIMIT_MEMLOCK, &lim)) { - perror("Failed to set rlimit:\n"); - return -1; - } - - page_size = getpagesize(); - - list = NULL; - - if (read_memory_info(&mem_free, &hugepage_size) != 0) { - printf("ERROR: Cannot read meminfo\n"); - return -1; - } - - mem_fragmentable_MB = mem_free * 0.8 / 1024; - - while (mem_fragmentable_MB > 0) { - map = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_LOCKED, -1, 0); - if (map == MAP_FAILED) - break; - - entry = malloc(sizeof(struct map_list)); - if (!entry) { - munmap(map, MAP_SIZE); - break; - } - entry->map = map; - entry->next = list; - list = entry; - - /* Write something (in this case the address of the map) to - * ensure that KSM can't merge the mapped pages - */ - for (i = 0; i < MAP_SIZE; i += page_size) - *(unsigned long *)(map + i) = (unsigned long)map + i; - - mem_fragmentable_MB -= MAP_SIZE_MB; - } - - for (entry = list; entry != NULL; entry = entry->next) { - munmap(entry->map, MAP_SIZE); - if (!entry->next) - break; - entry = entry->next; - } - - if (check_compaction(mem_free, hugepage_size) == 0) - return 0; - - return -1; -} --- a/tools/testing/selftests/vm/config +++ /dev/null @@ -1,8 +0,0 @@ -CONFIG_SYSVIPC=y -CONFIG_USERFAULTFD=y -CONFIG_TEST_VMALLOC=m -CONFIG_DEVICE_PRIVATE=y -CONFIG_TEST_HMM=m -CONFIG_GUP_TEST=y -CONFIG_TRANSPARENT_HUGEPAGE=y -CONFIG_MEM_SOFT_DIRTY=y --- a/tools/testing/selftests/vm/cow.c +++ /dev/null @@ -1,1764 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * COW (Copy On Write) tests. - * - * Copyright 2022, Red Hat, Inc. 
- * - * Author(s): David Hildenbrand <david@xxxxxxxxxx> - */ -#define _GNU_SOURCE -#include <stdlib.h> -#include <string.h> -#include <stdbool.h> -#include <stdint.h> -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <dirent.h> -#include <assert.h> -#include <sys/mman.h> -#include <sys/ioctl.h> -#include <sys/wait.h> -#include <linux/memfd.h> - -#include "local_config.h" -#ifdef LOCAL_CONFIG_HAVE_LIBURING -#include <liburing.h> -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - -#include "../../../../mm/gup_test.h" -#include "../kselftest.h" -#include "vm_util.h" - -#ifndef MADV_COLLAPSE -#define MADV_COLLAPSE 25 -#endif - -static size_t pagesize; -static int pagemap_fd; -static size_t thpsize; -static int nr_hugetlbsizes; -static size_t hugetlbsizes[10]; -static int gup_fd; -static bool has_huge_zeropage; - -static void detect_thpsize(void) -{ - int fd = open("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size", - O_RDONLY); - size_t size = 0; - char buf[15]; - int ret; - - if (fd < 0) - return; - - ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { - buf[ret] = 0; - - size = strtoul(buf, NULL, 10); - if (size < pagesize) - size = 0; - if (size > 0) { - thpsize = size; - ksft_print_msg("[INFO] detected THP size: %zu KiB\n", - thpsize / 1024); - } - } - - close(fd); -} - -static void detect_huge_zeropage(void) -{ - int fd = open("/sys/kernel/mm/transparent_hugepage/use_zero_page", - O_RDONLY); - size_t enabled = 0; - char buf[15]; - int ret; - - if (fd < 0) - return; - - ret = pread(fd, buf, sizeof(buf), 0); - if (ret > 0 && ret < sizeof(buf)) { - buf[ret] = 0; - - enabled = strtoul(buf, NULL, 10); - if (enabled == 1) { - has_huge_zeropage = true; - ksft_print_msg("[INFO] huge zeropage is enabled\n"); - } - } - - close(fd); -} - -static void detect_hugetlbsizes(void) -{ - DIR *dir = opendir("/sys/kernel/mm/hugepages/"); - - if (!dir) - return; - - while (nr_hugetlbsizes < ARRAY_SIZE(hugetlbsizes)) { - struct dirent *entry = readdir(dir); - size_t kb; - - if (!entry) - break; - if (entry->d_type != DT_DIR) - continue; - if (sscanf(entry->d_name, "hugepages-%zukB", &kb) != 1) - continue; - hugetlbsizes[nr_hugetlbsizes] = kb * 1024; - nr_hugetlbsizes++; - ksft_print_msg("[INFO] detected hugetlb size: %zu KiB\n", - kb); - } - closedir(dir); -} - -static bool range_is_swapped(void *addr, size_t size) -{ - for (; size; addr += pagesize, size -= pagesize) - if (!pagemap_is_swapped(pagemap_fd, addr)) - return false; - return true; -} - -struct comm_pipes { - int child_ready[2]; - int parent_ready[2]; -}; - -static int setup_comm_pipes(struct comm_pipes *comm_pipes) -{ - if (pipe(comm_pipes->child_ready) < 0) - return -errno; - if (pipe(comm_pipes->parent_ready) < 0) { - close(comm_pipes->child_ready[0]); - close(comm_pipes->child_ready[1]); - return -errno; - } - - return 0; -} - -static void close_comm_pipes(struct comm_pipes *comm_pipes) -{ - close(comm_pipes->child_ready[0]); - close(comm_pipes->child_ready[1]); - close(comm_pipes->parent_ready[0]); - close(comm_pipes->parent_ready[1]); -} - -static int child_memcmp_fn(char *mem, size_t size, - struct comm_pipes *comm_pipes) -{ - char *old = malloc(size); - char buf; - - /* Backup the original content. */ - memcpy(old, mem, size); - - /* Wait until the parent modified the page. */ - write(comm_pipes->child_ready[1], "0", 1); - while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) - ; - - /* See if we still read the old values. 
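(The invariant being certified here: after fork(), the parent's later memset() must break COW rather than scribble on shared frames, so the child has to keep reading the pre-fork contents; a nonzero memcmp() means parent writes leaked into the child.)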
*/ - return memcmp(old, mem, size); -} - -static int child_vmsplice_memcmp_fn(char *mem, size_t size, - struct comm_pipes *comm_pipes) -{ - struct iovec iov = { - .iov_base = mem, - .iov_len = size, - }; - ssize_t cur, total, transferred; - char *old, *new; - int fds[2]; - char buf; - - old = malloc(size); - new = malloc(size); - - /* Backup the original content. */ - memcpy(old, mem, size); - - if (pipe(fds) < 0) - return -errno; - - /* Trigger a read-only pin. */ - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred < 0) - return -errno; - if (transferred == 0) - return -EINVAL; - - /* Unmap it from our page tables. */ - if (munmap(mem, size) < 0) - return -errno; - - /* Wait until the parent modified it. */ - write(comm_pipes->child_ready[1], "0", 1); - while (read(comm_pipes->parent_ready[0], &buf, 1) != 1) - ; - - /* See if we still read the old values via the pipe. */ - for (total = 0; total < transferred; total += cur) { - cur = read(fds[0], new + total, transferred - total); - if (cur < 0) - return -errno; - } - - return memcmp(old, new, transferred); -} - -typedef int (*child_fn)(char *mem, size_t size, struct comm_pipes *comm_pipes); - -static void do_test_cow_in_parent(char *mem, size_t size, bool do_mprotect, - child_fn fn) -{ - struct comm_pipes comm_pipes; - char buf; - int ret; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - exit(fn(mem, size, &comm_pipes)); - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - if (do_mprotect) { - /* - * mprotect() optimizations might try avoiding - * write-faults by directly mapping pages writable. - */ - ret = mprotect(mem, size, PROT_READ); - ret |= mprotect(mem, size, PROT_READ|PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - } - - /* Modify the page. 
*/ - memset(mem, 0xff, size); - write(comm_pipes.parent_ready[1], "0", 1); - - wait(&ret); - if (WIFEXITED(ret)) - ret = WEXITSTATUS(ret); - else - ret = -EINVAL; - - ksft_test_result(!ret, "No leak from parent into child\n"); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_cow_in_parent(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, false, child_memcmp_fn); -} - -static void test_cow_in_parent_mprotect(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, true, child_memcmp_fn); -} - -static void test_vmsplice_in_child(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, false, child_vmsplice_memcmp_fn); -} - -static void test_vmsplice_in_child_mprotect(char *mem, size_t size) -{ - do_test_cow_in_parent(mem, size, true, child_vmsplice_memcmp_fn); -} - -static void do_test_vmsplice_in_parent(char *mem, size_t size, - bool before_fork) -{ - struct iovec iov = { - .iov_base = mem, - .iov_len = size, - }; - ssize_t cur, total, transferred; - struct comm_pipes comm_pipes; - char *old, *new; - int ret, fds[2]; - char buf; - - old = malloc(size); - new = malloc(size); - - memcpy(old, mem, size); - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - goto free; - } - - if (pipe(fds) < 0) { - ksft_test_result_fail("pipe() failed\n"); - goto close_comm_pipes; - } - - if (before_fork) { - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred <= 0) { - ksft_test_result_fail("vmsplice() failed\n"); - goto close_pipe; - } - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_pipe; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - /* Modify page content in the child. */ - memset(mem, 0xff, size); - exit(0); - } - - if (!before_fork) { - transferred = vmsplice(fds[1], &iov, 1, 0); - if (transferred <= 0) { - ksft_test_result_fail("vmsplice() failed\n"); - wait(&ret); - goto close_pipe; - } - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - if (munmap(mem, size) < 0) { - ksft_test_result_fail("munmap() failed\n"); - goto close_pipe; - } - write(comm_pipes.parent_ready[1], "0", 1); - - /* Wait until the child is done writing. */ - wait(&ret); - if (!WIFEXITED(ret)) { - ksft_test_result_fail("wait() failed\n"); - goto close_pipe; - } - - /* See if we still read the old values. 
*/ - for (total = 0; total < transferred; total += cur) { - cur = read(fds[0], new + total, transferred - total); - if (cur < 0) { - ksft_test_result_fail("read() failed\n"); - goto close_pipe; - } - } - - ksft_test_result(!memcmp(old, new, transferred), - "No leak from child into parent\n"); -close_pipe: - close(fds[0]); - close(fds[1]); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -free: - free(old); - free(new); -} - -static void test_vmsplice_before_fork(char *mem, size_t size) -{ - do_test_vmsplice_in_parent(mem, size, true); -} - -static void test_vmsplice_after_fork(char *mem, size_t size) -{ - do_test_vmsplice_in_parent(mem, size, false); -} - -#ifdef LOCAL_CONFIG_HAVE_LIBURING -static void do_test_iouring(char *mem, size_t size, bool use_fork) -{ - struct comm_pipes comm_pipes; - struct io_uring_cqe *cqe; - struct io_uring_sqe *sqe; - struct io_uring ring; - ssize_t cur, total; - struct iovec iov; - char *buf, *tmp; - int ret, fd; - FILE *file; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - file = tmpfile(); - if (!file) { - ksft_test_result_fail("tmpfile() failed\n"); - goto close_comm_pipes; - } - fd = fileno(file); - assert(fd); - - tmp = malloc(size); - if (!tmp) { - ksft_test_result_fail("malloc() failed\n"); - goto close_file; - } - - /* Skip on errors, as we might just lack kernel support. */ - ret = io_uring_queue_init(1, &ring, 0); - if (ret < 0) { - ksft_test_result_skip("io_uring_queue_init() failed\n"); - goto free_tmp; - } - - /* - * Register the range as a fixed buffer. This will FOLL_WRITE | FOLL_PIN - * | FOLL_LONGTERM the range. - * - * Skip on errors, as we might just lack kernel support or might not - * have sufficient MEMLOCK permissions. - */ - iov.iov_base = mem; - iov.iov_len = size; - ret = io_uring_register_buffers(&ring, &iov, 1); - if (ret) { - ksft_test_result_skip("io_uring_register_buffers() failed\n"); - goto queue_exit; - } - - if (use_fork) { - /* - * fork() and keep the child alive until we're done. Note that - * we expect the pinned page to not get shared with the child. - */ - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto unregister_buffers; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - exit(0); - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - } else { - /* - * Map the page R/O into the page table. Enable softdirty - * tracking to stop the page from getting mapped R/W immediately - * again by mprotect() optimizations. Note that we don't have an - * easy way to test if that worked (the pagemap does not export - * if the page is mapped R/O vs. R/W). - */ - ret = mprotect(mem, size, PROT_READ); - clear_softdirty(); - ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto unregister_buffers; - } - } - - /* - * Modify the page and write page content as observed by the fixed - * buffer pin to the file so we can verify it. 
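
For context, the liburing fixed-buffer dance used in do_test_iouring() reduces to the following core (a sketch; assumes liburing is installed and MEMLOCK limits allow the pin):

	#include <liburing.h>
	#include <sys/uio.h>

	/* Pin 'len' bytes at 'mem' and write them to 'fd' via the pin. */
	static int write_via_fixed_buffer(int fd, void *mem, unsigned int len)
	{
		struct iovec iov = { .iov_base = mem, .iov_len = len };
		struct io_uring_cqe *cqe;
		struct io_uring_sqe *sqe;
		struct io_uring ring;

		if (io_uring_queue_init(1, &ring, 0) < 0)
			return -1;
		/* FOLL_PIN | FOLL_LONGTERM | FOLL_WRITE under the hood. */
		if (io_uring_register_buffers(&ring, &iov, 1) < 0) {
			io_uring_queue_exit(&ring);
			return -1;
		}
		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_write_fixed(sqe, fd, mem, len, 0, 0);
		io_uring_submit(&ring);
		if (!io_uring_wait_cqe(&ring, &cqe))
			io_uring_cqe_seen(&ring, cqe);
		io_uring_unregister_buffers(&ring);
		io_uring_queue_exit(&ring);
		return 0;
	}
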
- */ - memset(mem, 0xff, size); - sqe = io_uring_get_sqe(&ring); - if (!sqe) { - ksft_test_result_fail("io_uring_get_sqe() failed\n"); - goto quit_child; - } - io_uring_prep_write_fixed(sqe, fd, mem, size, 0, 0); - - ret = io_uring_submit(&ring); - if (ret < 0) { - ksft_test_result_fail("io_uring_submit() failed\n"); - goto quit_child; - } - - ret = io_uring_wait_cqe(&ring, &cqe); - if (ret < 0) { - ksft_test_result_fail("io_uring_wait_cqe() failed\n"); - goto quit_child; - } - - if (cqe->res != size) { - ksft_test_result_fail("write_fixed failed\n"); - goto quit_child; - } - io_uring_cqe_seen(&ring, cqe); - - /* Read back the file content to the temporary buffer. */ - total = 0; - while (total < size) { - cur = pread(fd, tmp + total, size - total, total); - if (cur < 0) { - ksft_test_result_fail("pread() failed\n"); - goto quit_child; - } - total += cur; - } - - /* Finally, check if we read what we expected. */ - ksft_test_result(!memcmp(mem, tmp, size), - "Longterm R/W pin is reliable\n"); - -quit_child: - if (use_fork) { - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - } -unregister_buffers: - io_uring_unregister_buffers(&ring); -queue_exit: - io_uring_queue_exit(&ring); -free_tmp: - free(tmp); -close_file: - fclose(file); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_iouring_ro(char *mem, size_t size) -{ - do_test_iouring(mem, size, false); -} - -static void test_iouring_fork(char *mem, size_t size) -{ - do_test_iouring(mem, size, true); -} - -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - -enum ro_pin_test { - RO_PIN_TEST, - RO_PIN_TEST_SHARED, - RO_PIN_TEST_PREVIOUSLY_SHARED, - RO_PIN_TEST_RO_EXCLUSIVE, -}; - -static void do_test_ro_pin(char *mem, size_t size, enum ro_pin_test test, - bool fast) -{ - struct pin_longterm_test args; - struct comm_pipes comm_pipes; - char *tmp, buf; - __u64 tmp_val; - int ret; - - if (gup_fd < 0) { - ksft_test_result_skip("gup_test not available\n"); - return; - } - - tmp = malloc(size); - if (!tmp) { - ksft_test_result_fail("malloc() failed\n"); - return; - } - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - goto free_tmp; - } - - switch (test) { - case RO_PIN_TEST: - break; - case RO_PIN_TEST_SHARED: - case RO_PIN_TEST_PREVIOUSLY_SHARED: - /* - * Share the pages with our child. As the pages are not pinned, - * this should just work. - */ - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - write(comm_pipes.child_ready[1], "0", 1); - while (read(comm_pipes.parent_ready[0], &buf, 1) != 1) - ; - exit(0); - } - - /* Wait until our child is ready. */ - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - if (test == RO_PIN_TEST_PREVIOUSLY_SHARED) { - /* - * Tell the child to quit now and wait until it quit. - * The pages should now be mapped R/O into our page - * tables, but they are no longer shared. - */ - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - if (!WIFEXITED(ret)) - ksft_print_msg("[INFO] wait() failed\n"); - } - break; - case RO_PIN_TEST_RO_EXCLUSIVE: - /* - * Map the page R/O into the page table. Enable softdirty - * tracking to stop the page from getting mapped R/W immediately - * again by mprotect() optimizations. Note that we don't have an - * easy way to test if that worked (the pagemap does not export - * if the page is mapped R/O vs. R/W). 
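
The clear_softdirty() helper used here (from the selftests' shared vm_util code, if I read the tree right) amounts to one write to the documented clear_refs interface; writing "4" clears the soft-dirty bits for all of the task's mappings:

	#include <fcntl.h>
	#include <unistd.h>

	static void clear_softdirty(void)
	{
		int fd = open("/proc/self/clear_refs", O_WRONLY);

		if (fd < 0)
			return;
		write(fd, "4", 1);	/* "4" == clear soft-dirty bits */
		close(fd);
	}
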
- */ - ret = mprotect(mem, size, PROT_READ); - clear_softdirty(); - ret |= mprotect(mem, size, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - /* Take a R/O pin. This should trigger unsharing. */ - args.addr = (__u64)(uintptr_t)mem; - args.size = size; - args.flags = fast ? PIN_LONGTERM_TEST_FLAG_USE_FAST : 0; - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_START, &args); - if (ret) { - if (errno == EINVAL) - ksft_test_result_skip("PIN_LONGTERM_TEST_START failed\n"); - else - ksft_test_result_fail("PIN_LONGTERM_TEST_START failed\n"); - goto wait; - } - - /* Modify the page. */ - memset(mem, 0xff, size); - - /* - * Read back the content via the pin to the temporary buffer and - * test if we observed the modification. - */ - tmp_val = (__u64)(uintptr_t)tmp; - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_READ, &tmp_val); - if (ret) - ksft_test_result_fail("PIN_LONGTERM_TEST_READ failed\n"); - else - ksft_test_result(!memcmp(mem, tmp, size), - "Longterm R/O pin is reliable\n"); - - ret = ioctl(gup_fd, PIN_LONGTERM_TEST_STOP); - if (ret) - ksft_print_msg("[INFO] PIN_LONGTERM_TEST_STOP failed\n"); -wait: - switch (test) { - case RO_PIN_TEST_SHARED: - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - if (!WIFEXITED(ret)) - ksft_print_msg("[INFO] wait() failed\n"); - break; - default: - break; - } -close_comm_pipes: - close_comm_pipes(&comm_pipes); -free_tmp: - free(tmp); -} - -static void test_ro_pin_on_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, false); -} - -static void test_ro_fast_pin_on_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_SHARED, true); -} - -static void test_ro_pin_on_ro_previously_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, false); -} - -static void test_ro_fast_pin_on_ro_previously_shared(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_PREVIOUSLY_SHARED, true); -} - -static void test_ro_pin_on_ro_exclusive(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, false); -} - -static void test_ro_fast_pin_on_ro_exclusive(char *mem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST_RO_EXCLUSIVE, true); -} - -typedef void (*test_fn)(char *mem, size_t size); - -static void do_run_with_base_page(test_fn fn, bool swapout) -{ - char *mem; - int ret; - - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - - ret = madvise(mem, pagesize, MADV_NOHUGEPAGE); - /* Ignore if not around on a kernel. */ - if (ret && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); - goto munmap; - } - - /* Populate a base page. */ - memset(mem, 0, pagesize); - - if (swapout) { - madvise(mem, pagesize, MADV_PAGEOUT); - if (!pagemap_is_swapped(pagemap_fd, mem)) { - ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n"); - goto munmap; - } - } - - fn(mem, pagesize); -munmap: - munmap(mem, pagesize); -} - -static void run_with_base_page(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... with base page\n", desc); - do_run_with_base_page(fn, false); -} - -static void run_with_base_page_swap(test_fn fn, const char *desc) -{ - ksft_print_msg("[RUN] %s ... 
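
pagemap_is_swapped() boils down to reading one 64-bit entry per page from /proc/self/pagemap; per Documentation/admin-guide/mm/pagemap.rst, bit 63 is "present" and bit 62 is "swapped". A rough standalone equivalent:

	#include <fcntl.h>
	#include <stdint.h>
	#include <unistd.h>

	static int page_is_swapped(const void *addr)
	{
		uint64_t entry = 0;
		off_t off = ((uintptr_t)addr / getpagesize()) * sizeof(entry);
		int fd = open("/proc/self/pagemap", O_RDONLY);

		if (fd < 0)
			return -1;
		pread(fd, &entry, sizeof(entry), off);
		close(fd);
		return !!(entry & (1ULL << 62));	/* bit 62: swapped */
	}
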
with swapped out base page\n", desc);
-	do_run_with_base_page(fn, true);
-}
-
-enum thp_run {
-	THP_RUN_PMD,
-	THP_RUN_PMD_SWAPOUT,
-	THP_RUN_PTE,
-	THP_RUN_PTE_SWAPOUT,
-	THP_RUN_SINGLE_PTE,
-	THP_RUN_SINGLE_PTE_SWAPOUT,
-	THP_RUN_PARTIAL_MREMAP,
-	THP_RUN_PARTIAL_SHARED,
-};
-
-static void do_run_with_thp(test_fn fn, enum thp_run thp_run)
-{
-	char *mem, *mmap_mem, *tmp, *mremap_mem = MAP_FAILED;
-	size_t size, mmap_size, mremap_size;
-	int ret;
-
-	/* For alignment purposes, we need twice the thp size. */
-	mmap_size = 2 * thpsize;
-	mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
-			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-	if (mmap_mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		return;
-	}
-
-	/* We need a THP-aligned memory area. */
-	mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
-
-	ret = madvise(mem, thpsize, MADV_HUGEPAGE);
-	if (ret) {
-		ksft_test_result_fail("MADV_HUGEPAGE failed\n");
-		goto munmap;
-	}
-
-	/*
-	 * Try to populate a THP. Touch the first sub-page and test if we get
-	 * another sub-page populated automatically.
-	 */
-	mem[0] = 0;
-	if (!pagemap_is_populated(pagemap_fd, mem + pagesize)) {
-		ksft_test_result_skip("Did not get a THP populated\n");
-		goto munmap;
-	}
-	memset(mem, 0, thpsize);
-
-	size = thpsize;
-	switch (thp_run) {
-	case THP_RUN_PMD:
-	case THP_RUN_PMD_SWAPOUT:
-		break;
-	case THP_RUN_PTE:
-	case THP_RUN_PTE_SWAPOUT:
-		/*
-		 * Trigger PTE-mapping the THP by temporarily mapping a single
-		 * subpage R/O.
-		 */
-		ret = mprotect(mem + pagesize, pagesize, PROT_READ);
-		if (ret) {
-			ksft_test_result_fail("mprotect() failed\n");
-			goto munmap;
-		}
-		ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE);
-		if (ret) {
-			ksft_test_result_fail("mprotect() failed\n");
-			goto munmap;
-		}
-		break;
-	case THP_RUN_SINGLE_PTE:
-	case THP_RUN_SINGLE_PTE_SWAPOUT:
-		/*
-		 * Discard all but a single subpage of that PTE-mapped THP. What
-		 * remains is a single PTE mapping a single subpage.
-		 */
-		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTNEED);
-		if (ret) {
-			ksft_test_result_fail("MADV_DONTNEED failed\n");
-			goto munmap;
-		}
-		size = pagesize;
-		break;
-	case THP_RUN_PARTIAL_MREMAP:
-		/*
-		 * Remap half of the THP. We need some new memory location
-		 * for that.
-		 */
-		mremap_size = thpsize / 2;
-		mremap_mem = mmap(NULL, mremap_size, PROT_NONE,
-				  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-		if (mremap_mem == MAP_FAILED) {
-			ksft_test_result_fail("mmap() failed\n");
-			goto munmap;
-		}
-		tmp = mremap(mem + mremap_size, mremap_size, mremap_size,
-			     MREMAP_MAYMOVE | MREMAP_FIXED, mremap_mem);
-		if (tmp != mremap_mem) {
-			ksft_test_result_fail("mremap() failed\n");
-			goto munmap;
-		}
-		size = mremap_size;
-		break;
-	case THP_RUN_PARTIAL_SHARED:
-		/*
-		 * Share the first page of the THP with a child and quit the
-		 * child. This will result in some parts of the THP never
-		 * having been shared.
-		 */
-		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DONTFORK);
-		if (ret) {
-			ksft_test_result_fail("MADV_DONTFORK failed\n");
-			goto munmap;
-		}
-		ret = fork();
-		if (ret < 0) {
-			ksft_test_result_fail("fork() failed\n");
-			goto munmap;
-		} else if (!ret) {
-			exit(0);
-		}
-		wait(&ret);
-		/* Allow for sharing all pages again. */
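
The alignment idiom at the top of do_run_with_thp() is worth calling out: thpsize is a power of two, so rounding the over-allocated area up to the next thpsize boundary is a single mask operation. An illustrative helper (hypothetical name):

	#include <stdint.h>

	/* mmap_mem points at 2 * thpsize bytes; pick an aligned window. */
	static char *thp_aligned(char *mmap_mem, size_t thpsize)
	{
		return (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1));
	}
	/*
	 * e.g. mmap_mem == 0x7f1234501000 and thpsize == 0x200000 (2 MiB)
	 * yields 0x7f1234600000, with at least thpsize bytes left in the area.
	 */
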
-		ret = madvise(mem + pagesize, thpsize - pagesize, MADV_DOFORK);
-		if (ret) {
-			ksft_test_result_fail("MADV_DOFORK failed\n");
-			goto munmap;
-		}
-		break;
-	default:
-		assert(false);
-	}
-
-	switch (thp_run) {
-	case THP_RUN_PMD_SWAPOUT:
-	case THP_RUN_PTE_SWAPOUT:
-	case THP_RUN_SINGLE_PTE_SWAPOUT:
-		madvise(mem, size, MADV_PAGEOUT);
-		if (!range_is_swapped(mem, size)) {
-			ksft_test_result_skip("MADV_PAGEOUT did not work, is swap enabled?\n");
-			goto munmap;
-		}
-		break;
-	default:
-		break;
-	}
-
-	fn(mem, size);
-munmap:
-	munmap(mmap_mem, mmap_size);
-	if (mremap_mem != MAP_FAILED)
-		munmap(mremap_mem, mremap_size);
-}
-
-static void run_with_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PMD);
-}
-
-static void run_with_thp_swap(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with swapped-out THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PMD_SWAPOUT);
-}
-
-static void run_with_pte_mapped_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with PTE-mapped THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PTE);
-}
-
-static void run_with_pte_mapped_thp_swap(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with swapped-out, PTE-mapped THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PTE_SWAPOUT);
-}
-
-static void run_with_single_pte_of_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with single PTE of THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_SINGLE_PTE);
-}
-
-static void run_with_single_pte_of_thp_swap(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with single PTE of swapped-out THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_SINGLE_PTE_SWAPOUT);
-}
-
-static void run_with_partial_mremap_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with partially mremap()'ed THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PARTIAL_MREMAP);
-}
-
-static void run_with_partial_shared_thp(test_fn fn, const char *desc)
-{
-	ksft_print_msg("[RUN] %s ... with partially shared THP\n", desc);
-	do_run_with_thp(fn, THP_RUN_PARTIAL_SHARED);
-}
-
-static void run_with_hugetlb(test_fn fn, const char *desc, size_t hugetlbsize)
-{
-	int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;
-	char *mem, *dummy;
-
-	ksft_print_msg("[RUN] %s ... with hugetlb (%zu kB)\n", desc,
-		       hugetlbsize / 1024);
-
-	flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
-
-	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_skip("need more free huge pages\n");
-		return;
-	}
-
-	/* Populate a huge page. */
-	memset(mem, 0, hugetlbsize);
-
-	/*
-	 * We need a total of two hugetlb pages to handle COW/unsharing
-	 * properly, otherwise we might get zapped by a SIGBUS.
-	 */
-	dummy = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, flags, -1, 0);
-	if (dummy == MAP_FAILED) {
-		ksft_test_result_skip("need more free huge pages\n");
-		goto munmap;
-	}
-	munmap(dummy, hugetlbsize);
-
-	fn(mem, hugetlbsize);
-munmap:
-	munmap(mem, hugetlbsize);
-}
-
-struct test_case {
-	const char *desc;
-	test_fn fn;
-};
-
-/*
- * Test cases that are specific to anonymous pages: pages in private mappings
- * that may get shared via COW during fork().
- */
-static const struct test_case anon_test_cases[] = {
-	/*
-	 * Basic COW tests for fork() without any GUP. If we fail to break COW,
-	 * either the child can observe modifications by the parent or the
-	 * other way around.
-	 */
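
run_with_hugetlb() above encodes log2 of the huge page size into the mmap() flags, as mmap(2) documents via MAP_HUGE_SHIFT. A standalone illustration (falls back to the uapi value 26 if the libc headers lack the define):

	#include <sys/mman.h>

	#ifndef MAP_HUGE_SHIFT
	#define MAP_HUGE_SHIFT 26	/* from uapi asm-generic/mman-common.h */
	#endif

	/*
	 * For a 2 MiB hugetlb page: __builtin_ctzll(2 MiB) == 21, and
	 * (21 << MAP_HUGE_SHIFT) is exactly MAP_HUGE_2MB.
	 */
	static int hugetlb_mmap_flags(size_t hugetlbsize)
	{
		int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB;

		flags |= __builtin_ctzll(hugetlbsize) << MAP_HUGE_SHIFT;
		return flags;
	}
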
- */ - { - "Basic COW after fork()", - test_cow_in_parent, - }, - /* - * Basic test, but do an additional mprotect(PROT_READ)+ - * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. - */ - { - "Basic COW after fork() with mprotect() optimization", - test_cow_in_parent_mprotect, - }, - /* - * vmsplice() [R/O GUP] + unmap in the child; modify in the parent. If - * we miss to break COW, the child observes modifications by the parent. - * This is CVE-2020-29374 reported by Jann Horn. - */ - { - "vmsplice() + unmap in child", - test_vmsplice_in_child - }, - /* - * vmsplice() test, but do an additional mprotect(PROT_READ)+ - * mprotect(PROT_READ|PROT_WRITE) in the parent before write access. - */ - { - "vmsplice() + unmap in child with mprotect() optimization", - test_vmsplice_in_child_mprotect - }, - /* - * vmsplice() [R/O GUP] in parent before fork(), unmap in parent after - * fork(); modify in the child. If we miss to break COW, the parent - * observes modifications by the child. - */ - { - "vmsplice() before fork(), unmap in parent after fork()", - test_vmsplice_before_fork, - }, - /* - * vmsplice() [R/O GUP] + unmap in parent after fork(); modify in the - * child. If we miss to break COW, the parent observes modifications by - * the child. - */ - { - "vmsplice() + unmap in parent after fork()", - test_vmsplice_after_fork, - }, -#ifdef LOCAL_CONFIG_HAVE_LIBURING - /* - * Take a R/W longterm pin and then map the page R/O into the page - * table to trigger a write fault on next access. When modifying the - * page, the page content must be visible via the pin. - */ - { - "R/O-mapping a page registered as iouring fixed buffer", - test_iouring_ro, - }, - /* - * Take a R/W longterm pin and then fork() a child. When modifying the - * page, the page content must be visible via the pin. We expect the - * pinned page to not get shared with the child. - */ - { - "fork() with an iouring fixed buffer", - test_iouring_fork, - }, - -#endif /* LOCAL_CONFIG_HAVE_LIBURING */ - /* - * Take a R/O longterm pin on a R/O-mapped shared anonymous page. - * When modifying the page via the page table, the page content change - * must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped shared page", - test_ro_pin_on_shared, - }, - /* Same as above, but using GUP-fast. */ - { - "R/O GUP-fast pin on R/O-mapped shared page", - test_ro_fast_pin_on_shared, - }, - /* - * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page that - * was previously shared. When modifying the page via the page table, - * the page content change must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped previously-shared page", - test_ro_pin_on_ro_previously_shared, - }, - /* Same as above, but using GUP-fast. */ - { - "R/O GUP-fast pin on R/O-mapped previously-shared page", - test_ro_fast_pin_on_ro_previously_shared, - }, - /* - * Take a R/O longterm pin on a R/O-mapped exclusive anonymous page. - * When modifying the page via the page table, the page content change - * must be visible via the pin. - */ - { - "R/O GUP pin on R/O-mapped exclusive page", - test_ro_pin_on_ro_exclusive, - }, - /* Same as above, but using GUP-fast. 
*/ - { - "R/O GUP-fast pin on R/O-mapped exclusive page", - test_ro_fast_pin_on_ro_exclusive, - }, -}; - -static void run_anon_test_case(struct test_case const *test_case) -{ - int i; - - run_with_base_page(test_case->fn, test_case->desc); - run_with_base_page_swap(test_case->fn, test_case->desc); - if (thpsize) { - run_with_thp(test_case->fn, test_case->desc); - run_with_thp_swap(test_case->fn, test_case->desc); - run_with_pte_mapped_thp(test_case->fn, test_case->desc); - run_with_pte_mapped_thp_swap(test_case->fn, test_case->desc); - run_with_single_pte_of_thp(test_case->fn, test_case->desc); - run_with_single_pte_of_thp_swap(test_case->fn, test_case->desc); - run_with_partial_mremap_thp(test_case->fn, test_case->desc); - run_with_partial_shared_thp(test_case->fn, test_case->desc); - } - for (i = 0; i < nr_hugetlbsizes; i++) - run_with_hugetlb(test_case->fn, test_case->desc, - hugetlbsizes[i]); -} - -static void run_anon_test_cases(void) -{ - int i; - - ksft_print_msg("[INFO] Anonymous memory tests in private mappings\n"); - - for (i = 0; i < ARRAY_SIZE(anon_test_cases); i++) - run_anon_test_case(&anon_test_cases[i]); -} - -static int tests_per_anon_test_case(void) -{ - int tests = 2 + nr_hugetlbsizes; - - if (thpsize) - tests += 8; - return tests; -} - -enum anon_thp_collapse_test { - ANON_THP_COLLAPSE_UNSHARED, - ANON_THP_COLLAPSE_FULLY_SHARED, - ANON_THP_COLLAPSE_LOWER_SHARED, - ANON_THP_COLLAPSE_UPPER_SHARED, -}; - -static void do_test_anon_thp_collapse(char *mem, size_t size, - enum anon_thp_collapse_test test) -{ - struct comm_pipes comm_pipes; - char buf; - int ret; - - ret = setup_comm_pipes(&comm_pipes); - if (ret) { - ksft_test_result_fail("pipe() failed\n"); - return; - } - - /* - * Trigger PTE-mapping the THP by temporarily mapping a single subpage - * R/O, such that we can try collapsing it later. - */ - ret = mprotect(mem + pagesize, pagesize, PROT_READ); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - ret = mprotect(mem + pagesize, pagesize, PROT_READ | PROT_WRITE); - if (ret) { - ksft_test_result_fail("mprotect() failed\n"); - goto close_comm_pipes; - } - - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - /* Collapse before actually COW-sharing the page. */ - ret = madvise(mem, size, MADV_COLLAPSE); - if (ret) { - ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", - strerror(errno)); - goto close_comm_pipes; - } - break; - case ANON_THP_COLLAPSE_FULLY_SHARED: - /* COW-share the full PTE-mapped THP. */ - break; - case ANON_THP_COLLAPSE_LOWER_SHARED: - /* Don't COW-share the upper part of the THP. */ - ret = madvise(mem + size / 2, size / 2, MADV_DONTFORK); - if (ret) { - ksft_test_result_fail("MADV_DONTFORK failed\n"); - goto close_comm_pipes; - } - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - /* Don't COW-share the lower part of the THP. 
*/ - ret = madvise(mem, size / 2, MADV_DONTFORK); - if (ret) { - ksft_test_result_fail("MADV_DONTFORK failed\n"); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - ret = fork(); - if (ret < 0) { - ksft_test_result_fail("fork() failed\n"); - goto close_comm_pipes; - } else if (!ret) { - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - case ANON_THP_COLLAPSE_FULLY_SHARED: - exit(child_memcmp_fn(mem, size, &comm_pipes)); - break; - case ANON_THP_COLLAPSE_LOWER_SHARED: - exit(child_memcmp_fn(mem, size / 2, &comm_pipes)); - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - exit(child_memcmp_fn(mem + size / 2, size / 2, - &comm_pipes)); - break; - default: - assert(false); - } - } - - while (read(comm_pipes.child_ready[0], &buf, 1) != 1) - ; - - switch (test) { - case ANON_THP_COLLAPSE_UNSHARED: - break; - case ANON_THP_COLLAPSE_UPPER_SHARED: - case ANON_THP_COLLAPSE_LOWER_SHARED: - /* - * Revert MADV_DONTFORK such that we merge the VMAs and are - * able to actually collapse. - */ - ret = madvise(mem, size, MADV_DOFORK); - if (ret) { - ksft_test_result_fail("MADV_DOFORK failed\n"); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - /* FALLTHROUGH */ - case ANON_THP_COLLAPSE_FULLY_SHARED: - /* Collapse before anyone modified the COW-shared page. */ - ret = madvise(mem, size, MADV_COLLAPSE); - if (ret) { - ksft_test_result_skip("MADV_COLLAPSE failed: %s\n", - strerror(errno)); - write(comm_pipes.parent_ready[1], "0", 1); - wait(&ret); - goto close_comm_pipes; - } - break; - default: - assert(false); - } - - /* Modify the page. */ - memset(mem, 0xff, size); - write(comm_pipes.parent_ready[1], "0", 1); - - wait(&ret); - if (WIFEXITED(ret)) - ret = WEXITSTATUS(ret); - else - ret = -EINVAL; - - ksft_test_result(!ret, "No leak from parent into child\n"); -close_comm_pipes: - close_comm_pipes(&comm_pipes); -} - -static void test_anon_thp_collapse_unshared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UNSHARED); -} - -static void test_anon_thp_collapse_fully_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_FULLY_SHARED); -} - -static void test_anon_thp_collapse_lower_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_LOWER_SHARED); -} - -static void test_anon_thp_collapse_upper_shared(char *mem, size_t size) -{ - do_test_anon_thp_collapse(mem, size, ANON_THP_COLLAPSE_UPPER_SHARED); -} - -/* - * Test cases that are specific to anonymous THP: pages in private mappings - * that may get shared via COW during fork(). - */ -static const struct test_case anon_thp_test_cases[] = { - /* - * Basic COW test for fork() without any GUP when collapsing a THP - * before fork(). - * - * Re-mapping a PTE-mapped anon THP using a single PMD ("in-place - * collapse") might easily get COW handling wrong when not collapsing - * exclusivity information properly. - */ - { - "Basic COW after fork() when collapsing before fork()", - test_anon_thp_collapse_unshared, - }, - /* Basic COW test, but collapse after COW-sharing a full THP. */ - { - "Basic COW after fork() when collapsing after fork() (fully shared)", - test_anon_thp_collapse_fully_shared, - }, - /* - * Basic COW test, but collapse after COW-sharing the lower half of a - * THP. - */ - { - "Basic COW after fork() when collapsing after fork() (lower shared)", - test_anon_thp_collapse_lower_shared, - }, - /* - * Basic COW test, but collapse after COW-sharing the upper half of a - * THP. 
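
MADV_COLLAPSE, which the collapse paths above depend on, went in with kernel 6.1: it synchronously attempts to collapse the given range into a THP, independent of khugepaged. Minimal usage, with a fallback define in case the libc headers are older (the uapi value is 25):

	#include <stdio.h>
	#include <sys/mman.h>

	#ifndef MADV_COLLAPSE
	#define MADV_COLLAPSE 25	/* uapi asm-generic/mman-common.h */
	#endif

	/* Ask the kernel to back [mem, mem + size) with huge pages now. */
	static int collapse_range(void *mem, size_t size)
	{
		if (madvise(mem, size, MADV_COLLAPSE)) {
			perror("MADV_COLLAPSE");	/* EINVAL on old kernels */
			return -1;
		}
		return 0;
	}
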
- */ - { - "Basic COW after fork() when collapsing after fork() (upper shared)", - test_anon_thp_collapse_upper_shared, - }, -}; - -static void run_anon_thp_test_cases(void) -{ - int i; - - if (!thpsize) - return; - - ksft_print_msg("[INFO] Anonymous THP tests\n"); - - for (i = 0; i < ARRAY_SIZE(anon_thp_test_cases); i++) { - struct test_case const *test_case = &anon_thp_test_cases[i]; - - ksft_print_msg("[RUN] %s\n", test_case->desc); - do_run_with_thp(test_case->fn, THP_RUN_PMD); - } -} - -static int tests_per_anon_thp_test_case(void) -{ - return thpsize ? 1 : 0; -} - -typedef void (*non_anon_test_fn)(char *mem, const char *smem, size_t size); - -static void test_cow(char *mem, const char *smem, size_t size) -{ - char *old = malloc(size); - - /* Backup the original content. */ - memcpy(old, smem, size); - - /* Modify the page. */ - memset(mem, 0xff, size); - - /* See if we still read the old values via the other mapping. */ - ksft_test_result(!memcmp(smem, old, size), - "Other mapping not modified\n"); - free(old); -} - -static void test_ro_pin(char *mem, const char *smem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST, false); -} - -static void test_ro_fast_pin(char *mem, const char *smem, size_t size) -{ - do_test_ro_pin(mem, size, RO_PIN_TEST, true); -} - -static void run_with_zeropage(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, tmp; - - ksft_print_msg("[RUN] %s ... with shared zeropage\n", desc); - - mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - - smem = mmap(NULL, pagesize, PROT_READ, MAP_PRIVATE | MAP_ANON, -1, 0); - if (mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* Read from the page to populate the shared zeropage. */ - tmp = *mem + *smem; - asm volatile("" : "+r" (tmp)); - - fn(mem, smem, pagesize); -munmap: - munmap(mem, pagesize); - if (smem != MAP_FAILED) - munmap(smem, pagesize); -} - -static void run_with_huge_zeropage(non_anon_test_fn fn, const char *desc) -{ - char *mem, *smem, *mmap_mem, *mmap_smem, tmp; - size_t mmap_size; - int ret; - - ksft_print_msg("[RUN] %s ... with huge zeropage\n", desc); - - if (!has_huge_zeropage) { - ksft_test_result_skip("Huge zeropage not enabled\n"); - return; - } - - /* For alignment purposes, we need twice the thp size. */ - mmap_size = 2 * thpsize; - mmap_mem = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mmap_mem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return; - } - mmap_smem = mmap(NULL, mmap_size, PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (mmap_smem == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - goto munmap; - } - - /* We need a THP-aligned memory area. */ - mem = (char *)(((uintptr_t)mmap_mem + thpsize) & ~(thpsize - 1)); - smem = (char *)(((uintptr_t)mmap_smem + thpsize) & ~(thpsize - 1)); - - ret = madvise(mem, thpsize, MADV_HUGEPAGE); - ret |= madvise(smem, thpsize, MADV_HUGEPAGE); - if (ret) { - ksft_test_result_fail("MADV_HUGEPAGE failed\n"); - goto munmap; - } - - /* - * Read from the memory to populate the huge shared zeropage. Read from - * the first sub-page and test if we get another sub-page populated - * automatically. 
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-	if (!pagemap_is_populated(pagemap_fd, mem + pagesize) ||
-	    !pagemap_is_populated(pagemap_fd, smem + pagesize)) {
-		ksft_test_result_skip("Did not get THPs populated\n");
-		goto munmap;
-	}
-
-	fn(mem, smem, thpsize);
-munmap:
-	munmap(mmap_mem, mmap_size);
-	if (mmap_smem != MAP_FAILED)
-		munmap(mmap_smem, mmap_size);
-}
-
-static void run_with_memfd(non_anon_test_fn fn, const char *desc)
-{
-	char *mem, *smem, tmp;
-	int fd;
-
-	ksft_print_msg("[RUN] %s ... with memfd\n", desc);
-
-	fd = memfd_create("test", 0);
-	if (fd < 0) {
-		ksft_test_result_fail("memfd_create() failed\n");
-		return;
-	}
-
-	/* File consists of a single page filled with zeroes. */
-	if (fallocate(fd, 0, 0, pagesize)) {
-		ksft_test_result_fail("fallocate() failed\n");
-		goto close;
-	}
-
-	/* Create a private mapping of the memfd. */
-	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto close;
-	}
-	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
-	if (smem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* Fault the page in. */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-
-	fn(mem, smem, pagesize);
-munmap:
-	munmap(mem, pagesize);
-	if (smem != MAP_FAILED)
-		munmap(smem, pagesize);
-close:
-	close(fd);
-}
-
-static void run_with_tmpfile(non_anon_test_fn fn, const char *desc)
-{
-	char *mem, *smem, tmp;
-	FILE *file;
-	int fd;
-
-	ksft_print_msg("[RUN] %s ... with tmpfile\n", desc);
-
-	file = tmpfile();
-	if (!file) {
-		ksft_test_result_fail("tmpfile() failed\n");
-		return;
-	}
-
-	fd = fileno(file);
-	if (fd < 0) {
-		ksft_test_result_skip("fileno() failed\n");
-		return;
-	}
-
-	/* File consists of a single page filled with zeroes. */
-	if (fallocate(fd, 0, 0, pagesize)) {
-		ksft_test_result_fail("fallocate() failed\n");
-		goto close;
-	}
-
-	/* Create a private mapping of the file. */
-	mem = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto close;
-	}
-	smem = mmap(NULL, pagesize, PROT_READ, MAP_SHARED, fd, 0);
-	if (smem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* Fault the page in. */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-
-	fn(mem, smem, pagesize);
-munmap:
-	munmap(mem, pagesize);
-	if (smem != MAP_FAILED)
-		munmap(smem, pagesize);
-close:
-	fclose(file);
-}
-
-static void run_with_memfd_hugetlb(non_anon_test_fn fn, const char *desc,
-				   size_t hugetlbsize)
-{
-	int flags = MFD_HUGETLB;
-	char *mem, *smem, tmp;
-	int fd;
-
-	ksft_print_msg("[RUN] %s ... with memfd hugetlb (%zu kB)\n", desc,
-		       hugetlbsize / 1024);
-
-	flags |= __builtin_ctzll(hugetlbsize) << MFD_HUGE_SHIFT;
-
-	fd = memfd_create("test", flags);
-	if (fd < 0) {
-		ksft_test_result_skip("memfd_create() failed\n");
-		return;
-	}
-
-	/* File consists of a single page filled with zeroes. */
-	if (fallocate(fd, 0, 0, hugetlbsize)) {
-		ksft_test_result_skip("need more free huge pages\n");
-		goto close;
-	}
-
-	/* Create a private mapping of the memfd. */
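
The memfd runner above is the pattern all non-anon tests rely on: the same file mapped once MAP_PRIVATE (written through, forcing COW into anonymous pages) and once MAP_SHARED R/O (the reference view). Distilled into a sketch (glibc >= 2.27 for memfd_create()):

	#define _GNU_SOURCE
	#include <assert.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t size = getpagesize();
		int fd = memfd_create("cow-demo", 0);
		char *mem, *smem;

		assert(fd >= 0 && !ftruncate(fd, size));
		mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
		smem = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
		assert(mem != MAP_FAILED && smem != MAP_FAILED);

		memset(mem, 0xff, size);	/* COWs into a private anon page */
		assert(smem[0] == 0);		/* the file itself must stay zero */
		return 0;
	}
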
-	mem = mmap(NULL, hugetlbsize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd,
-		   0);
-	if (mem == MAP_FAILED) {
-		ksft_test_result_skip("need more free huge pages\n");
-		goto close;
-	}
-	smem = mmap(NULL, hugetlbsize, PROT_READ, MAP_SHARED, fd, 0);
-	if (smem == MAP_FAILED) {
-		ksft_test_result_fail("mmap() failed\n");
-		goto munmap;
-	}
-
-	/* Fault the page in. */
-	tmp = *mem + *smem;
-	asm volatile("" : "+r" (tmp));
-
-	fn(mem, smem, hugetlbsize);
-munmap:
-	munmap(mem, hugetlbsize);
-	if (smem != MAP_FAILED)
-		munmap(smem, hugetlbsize);
-close:
-	close(fd);
-}
-
-struct non_anon_test_case {
-	const char *desc;
-	non_anon_test_fn fn;
-};
-
-/*
- * Test cases that target any pages in private mappings that are not anonymous:
- * pages that may get shared via COW independent of fork(). This includes
- * the shared zeropage(s), pagecache pages, ...
- */
-static const struct non_anon_test_case non_anon_test_cases[] = {
-	/*
-	 * Basic COW test without any GUP. If we fail to break COW, changes are
-	 * visible via other private/shared mappings.
-	 */
-	{
-		"Basic COW",
-		test_cow,
-	},
-	/*
-	 * Take a R/O longterm pin. When modifying the page via the page table,
-	 * the page content change must be visible via the pin.
-	 */
-	{
-		"R/O longterm GUP pin",
-		test_ro_pin,
-	},
-	/* Same as above, but using GUP-fast. */
-	{
-		"R/O longterm GUP-fast pin",
-		test_ro_fast_pin,
-	},
-};
-
-static void run_non_anon_test_case(struct non_anon_test_case const *test_case)
-{
-	int i;
-
-	run_with_zeropage(test_case->fn, test_case->desc);
-	run_with_memfd(test_case->fn, test_case->desc);
-	run_with_tmpfile(test_case->fn, test_case->desc);
-	if (thpsize)
-		run_with_huge_zeropage(test_case->fn, test_case->desc);
-	for (i = 0; i < nr_hugetlbsizes; i++)
-		run_with_memfd_hugetlb(test_case->fn, test_case->desc,
-				       hugetlbsizes[i]);
-}
-
-static void run_non_anon_test_cases(void)
-{
-	int i;
-
-	ksft_print_msg("[INFO] Non-anonymous memory tests in private mappings\n");
-
-	for (i = 0; i < ARRAY_SIZE(non_anon_test_cases); i++)
-		run_non_anon_test_case(&non_anon_test_cases[i]);
-}
-
-static int tests_per_non_anon_test_case(void)
-{
-	int tests = 3 + nr_hugetlbsizes;
-
-	if (thpsize)
-		tests += 1;
-	return tests;
-}
-
-int main(int argc, char **argv)
-{
-	int err;
-
-	pagesize = getpagesize();
-	detect_thpsize();
-	detect_hugetlbsizes();
-	detect_huge_zeropage();
-
-	ksft_print_header();
-	ksft_set_plan(ARRAY_SIZE(anon_test_cases) * tests_per_anon_test_case() +
-		      ARRAY_SIZE(anon_thp_test_cases) * tests_per_anon_thp_test_case() +
-		      ARRAY_SIZE(non_anon_test_cases) * tests_per_non_anon_test_case());
-
-	gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR);
-	pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
-	if (pagemap_fd < 0)
-		ksft_exit_fail_msg("opening pagemap failed\n");
-
-	run_anon_test_cases();
-	run_anon_thp_test_cases();
-	run_non_anon_test_cases();
-
-	err = ksft_get_fail_cnt();
-	if (err)
-		ksft_exit_fail_msg("%d out of %d tests failed\n",
-				   err, ksft_test_num());
-	return ksft_exit_pass();
-}
--- a/tools/testing/selftests/vm/.gitignore
+++ /dev/null
@@ -1,38 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-cow
-hugepage-mmap
-hugepage-mremap
-hugepage-shm
-hugepage-vmemmap
-hugetlb-madvise
-khugepaged
-map_hugetlb
-map_populate
-thuge-gen
-compaction_test
-migration
-mlock2-tests
-mrelease_test
-mremap_dontunmap
-mremap_test
-on-fault-limit
-transhuge-stress
-protection_keys
-protection_keys_32
-protection_keys_64
-madv_populate
-userfaultfd
-mlock-intersect-test
-mlock-random-test
-virtual_address_range
-gup_test
-va_128TBswitch -map_fixed_noreplace -write_to_hugetlbfs -hmm-tests -memfd_secret -soft-dirty -split_huge_page_test -ksm_tests -local_config.h -local_config.mk --- a/tools/testing/selftests/vm/gup_test.c +++ /dev/null @@ -1,271 +0,0 @@ -#include <fcntl.h> -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> -#include <dirent.h> -#include <sys/ioctl.h> -#include <sys/mman.h> -#include <sys/stat.h> -#include <sys/types.h> -#include <pthread.h> -#include <assert.h> -#include <mm/gup_test.h> -#include "../kselftest.h" - -#include "util.h" - -#define MB (1UL << 20) - -/* Just the flags we need, copied from mm.h: */ -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ - -#define GUP_TEST_FILE "/sys/kernel/debug/gup_test" - -static unsigned long cmd = GUP_FAST_BENCHMARK; -static int gup_fd, repeats = 1; -static unsigned long size = 128 * MB; -/* Serialize prints */ -static pthread_mutex_t print_mutex = PTHREAD_MUTEX_INITIALIZER; - -static char *cmd_to_str(unsigned long cmd) -{ - switch (cmd) { - case GUP_FAST_BENCHMARK: - return "GUP_FAST_BENCHMARK"; - case PIN_FAST_BENCHMARK: - return "PIN_FAST_BENCHMARK"; - case PIN_LONGTERM_BENCHMARK: - return "PIN_LONGTERM_BENCHMARK"; - case GUP_BASIC_TEST: - return "GUP_BASIC_TEST"; - case PIN_BASIC_TEST: - return "PIN_BASIC_TEST"; - case DUMP_USER_PAGES_TEST: - return "DUMP_USER_PAGES_TEST"; - } - return "Unknown command"; -} - -void *gup_thread(void *data) -{ - struct gup_test gup = *(struct gup_test *)data; - int i; - - /* Only report timing information on the *_BENCHMARK commands: */ - if ((cmd == PIN_FAST_BENCHMARK) || (cmd == GUP_FAST_BENCHMARK) || - (cmd == PIN_LONGTERM_BENCHMARK)) { - for (i = 0; i < repeats; i++) { - gup.size = size; - if (ioctl(gup_fd, cmd, &gup)) - perror("ioctl"), exit(1); - - pthread_mutex_lock(&print_mutex); - printf("%s: Time: get:%lld put:%lld us", - cmd_to_str(cmd), gup.get_delta_usec, - gup.put_delta_usec); - if (gup.size != size) - printf(", truncated (size: %lld)", gup.size); - printf("\n"); - pthread_mutex_unlock(&print_mutex); - } - } else { - gup.size = size; - if (ioctl(gup_fd, cmd, &gup)) { - perror("ioctl"); - exit(1); - } - - pthread_mutex_lock(&print_mutex); - printf("%s: done\n", cmd_to_str(cmd)); - if (gup.size != size) - printf("Truncated (size: %lld)\n", gup.size); - pthread_mutex_unlock(&print_mutex); - } - - return NULL; -} - -int main(int argc, char **argv) -{ - struct gup_test gup = { 0 }; - int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; - int flags = MAP_PRIVATE, touch = 0; - char *file = "/dev/zero"; - pthread_t *tid; - char *p; - - while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { - switch (opt) { - case 'a': - cmd = PIN_FAST_BENCHMARK; - break; - case 'b': - cmd = PIN_BASIC_TEST; - break; - case 'L': - cmd = PIN_LONGTERM_BENCHMARK; - break; - case 'c': - cmd = DUMP_USER_PAGES_TEST; - /* - * Dump page 0 (index 1). May be overridden later, by - * user's non-option arguments. - * - * .which_pages is zero-based, so that zero can mean "do - * nothing". 
-			gup.which_pages[0] = 1;
-			break;
-		case 'p':
-			/* works only with DUMP_USER_PAGES_TEST */
-			gup.test_flags |= GUP_TEST_FLAG_DUMP_PAGES_USE_PIN;
-			break;
-		case 'F':
-			/* strtol, so you can pass flags in hex form */
-			gup.gup_flags = strtol(optarg, 0, 0);
-			break;
-		case 'j':
-			nthreads = atoi(optarg);
-			break;
-		case 'm':
-			size = atoi(optarg) * MB;
-			break;
-		case 'r':
-			repeats = atoi(optarg);
-			break;
-		case 'n':
-			nr_pages = atoi(optarg);
-			break;
-		case 't':
-			thp = 1;
-			break;
-		case 'T':
-			thp = 0;
-			break;
-		case 'U':
-			cmd = GUP_BASIC_TEST;
-			break;
-		case 'u':
-			cmd = GUP_FAST_BENCHMARK;
-			break;
-		case 'w':
-			write = 1;
-			break;
-		case 'W':
-			write = 0;
-			break;
-		case 'f':
-			file = optarg;
-			break;
-		case 'S':
-			flags &= ~MAP_PRIVATE;
-			flags |= MAP_SHARED;
-			break;
-		case 'H':
-			flags |= (MAP_HUGETLB | MAP_ANONYMOUS);
-			break;
-		case 'z':
-			/* fault pages in gup, do not fault in userland */
-			touch = 1;
-			break;
-		default:
-			return -1;
-		}
-	}
-
-	if (optind < argc) {
-		int extra_arg_count = 0;
-		/*
-		 * For example:
-		 *
-		 *   ./gup_test -c 0 1 0x1001
-		 *
-		 * ...to dump pages 0, 1, and 4097
-		 */
-
-		while ((optind < argc) &&
-		       (extra_arg_count < GUP_TEST_MAX_PAGES_TO_DUMP)) {
-			/*
-			 * Do the 1-based indexing here, so that the user can
-			 * use normal 0-based indexing on the command line.
-			 */
-			long page_index = strtol(argv[optind], 0, 0) + 1;
-
-			gup.which_pages[extra_arg_count] = page_index;
-			extra_arg_count++;
-			optind++;
-		}
-	}
-
-	filed = open(file, O_RDWR|O_CREAT, 0664);
-	if (filed < 0) {
-		perror("open");
-		exit(filed);
-	}
-
-	gup.nr_pages_per_call = nr_pages;
-	if (write)
-		gup.gup_flags |= FOLL_WRITE;
-
-	gup_fd = open(GUP_TEST_FILE, O_RDWR);
-	if (gup_fd == -1) {
-		switch (errno) {
-		case EACCES:
-			if (getuid())
-				printf("Please run this test as root\n");
-			break;
-		case ENOENT:
-			if (opendir("/sys/kernel/debug") == NULL) {
-				printf("mount debugfs at /sys/kernel/debug\n");
-				break;
-			}
-			printf("check if CONFIG_GUP_TEST is enabled in kernel config\n");
-			break;
-		default:
-			perror("failed to open " GUP_TEST_FILE);
-			break;
-		}
-		exit(KSFT_SKIP);
-	}
-
-	p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0);
-	if (p == MAP_FAILED) {
-		perror("mmap");
-		exit(1);
-	}
-	gup.addr = (unsigned long)p;
-
-	if (thp == 1)
-		madvise(p, size, MADV_HUGEPAGE);
-	else if (thp == 0)
-		madvise(p, size, MADV_NOHUGEPAGE);
-
-	/*
-	 * FOLL_TOUCH, in gup_test, is used as an either/or case: either
-	 * fault pages in from the kernel via FOLL_TOUCH, or fault them
-	 * in here, from user space. This allows comparison of performance
-	 * between those two cases.
-	 */
-	if (touch) {
-		gup.gup_flags |= FOLL_TOUCH;
-	} else {
-		for (; (unsigned long)p < gup.addr + size; p += PAGE_SIZE)
-			p[0] = 0;
-	}
-
-	tid = malloc(sizeof(pthread_t) * nthreads);
-	assert(tid);
-	for (i = 0; i < nthreads; i++) {
-		ret = pthread_create(&tid[i], NULL, gup_thread, &gup);
-		assert(ret == 0);
-	}
-	for (i = 0; i < nthreads; i++) {
-		ret = pthread_join(tid[i], NULL);
-		assert(ret == 0);
-	}
-	free(tid);
-
-	return 0;
-}
--- a/tools/testing/selftests/vm/hmm-tests.c
+++ /dev/null
@@ -1,2054 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * HMM stands for Heterogeneous Memory Management; it is a helper layer inside
- * the linux kernel to help device drivers mirror a process address space in
- * the device. This allows the device to use the same address space which
- * makes communication and data exchange a lot easier.
- * - * This framework's sole purpose is to exercise various code paths inside - * the kernel to make sure that HMM performs as expected and to flush out any - * bugs. - */ - -#include "../kselftest_harness.h" - -#include <errno.h> -#include <fcntl.h> -#include <stdio.h> -#include <stdlib.h> -#include <stdint.h> -#include <unistd.h> -#include <strings.h> -#include <time.h> -#include <pthread.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/mman.h> -#include <sys/ioctl.h> - - -/* - * This is a private UAPI to the kernel test module so it isn't exported - * in the usual include/uapi/... directory. - */ -#include <lib/test_hmm_uapi.h> -#include <mm/gup_test.h> - -struct hmm_buffer { - void *ptr; - void *mirror; - unsigned long size; - int fd; - uint64_t cpages; - uint64_t faults; -}; - -enum { - HMM_PRIVATE_DEVICE_ONE, - HMM_PRIVATE_DEVICE_TWO, - HMM_COHERENCE_DEVICE_ONE, - HMM_COHERENCE_DEVICE_TWO, -}; - -#define TWOMEG (1 << 21) -#define HMM_BUFFER_SIZE (1024 << 12) -#define HMM_PATH_MAX 64 -#define NTIMES 10 - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) -/* Just the flags we need, copied from mm.h: */ -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite */ - -FIXTURE(hmm) -{ - int fd; - unsigned int page_size; - unsigned int page_shift; -}; - -FIXTURE_VARIANT(hmm) -{ - int device_number; -}; - -FIXTURE_VARIANT_ADD(hmm, hmm_device_private) -{ - .device_number = HMM_PRIVATE_DEVICE_ONE, -}; - -FIXTURE_VARIANT_ADD(hmm, hmm_device_coherent) -{ - .device_number = HMM_COHERENCE_DEVICE_ONE, -}; - -FIXTURE(hmm2) -{ - int fd0; - int fd1; - unsigned int page_size; - unsigned int page_shift; -}; - -FIXTURE_VARIANT(hmm2) -{ - int device_number0; - int device_number1; -}; - -FIXTURE_VARIANT_ADD(hmm2, hmm2_device_private) -{ - .device_number0 = HMM_PRIVATE_DEVICE_ONE, - .device_number1 = HMM_PRIVATE_DEVICE_TWO, -}; - -FIXTURE_VARIANT_ADD(hmm2, hmm2_device_coherent) -{ - .device_number0 = HMM_COHERENCE_DEVICE_ONE, - .device_number1 = HMM_COHERENCE_DEVICE_TWO, -}; - -static int hmm_open(int unit) -{ - char pathname[HMM_PATH_MAX]; - int fd; - - snprintf(pathname, sizeof(pathname), "/dev/hmm_dmirror%d", unit); - fd = open(pathname, O_RDWR, 0); - if (fd < 0) - fprintf(stderr, "could not open hmm dmirror driver (%s)\n", - pathname); - return fd; -} - -static bool hmm_is_coherent_type(int dev_num) -{ - return (dev_num >= HMM_COHERENCE_DEVICE_ONE); -} - -FIXTURE_SETUP(hmm) -{ - self->page_size = sysconf(_SC_PAGE_SIZE); - self->page_shift = ffs(self->page_size) - 1; - - self->fd = hmm_open(variant->device_number); - if (self->fd < 0 && hmm_is_coherent_type(variant->device_number)) - SKIP(exit(0), "DEVICE_COHERENT not available"); - ASSERT_GE(self->fd, 0); -} - -FIXTURE_SETUP(hmm2) -{ - self->page_size = sysconf(_SC_PAGE_SIZE); - self->page_shift = ffs(self->page_size) - 1; - - self->fd0 = hmm_open(variant->device_number0); - if (self->fd0 < 0 && hmm_is_coherent_type(variant->device_number0)) - SKIP(exit(0), "DEVICE_COHERENT not available"); - ASSERT_GE(self->fd0, 0); - self->fd1 = hmm_open(variant->device_number1); - ASSERT_GE(self->fd1, 0); -} - -FIXTURE_TEARDOWN(hmm) -{ - int ret = close(self->fd); - - ASSERT_EQ(ret, 0); - self->fd = -1; -} - -FIXTURE_TEARDOWN(hmm2) -{ - int ret = close(self->fd0); - - ASSERT_EQ(ret, 0); - self->fd0 = -1; - - ret = close(self->fd1); - ASSERT_EQ(ret, 0); - self->fd1 = -1; -} - -static int hmm_dmirror_cmd(int fd, - unsigned long request, - struct hmm_buffer *buffer, - unsigned long npages) 
-{ - struct hmm_dmirror_cmd cmd; - int ret; - - /* Simulate a device reading system memory. */ - cmd.addr = (__u64)buffer->ptr; - cmd.ptr = (__u64)buffer->mirror; - cmd.npages = npages; - - for (;;) { - ret = ioctl(fd, request, &cmd); - if (ret == 0) - break; - if (errno == EINTR) - continue; - return -errno; - } - buffer->cpages = cmd.cpages; - buffer->faults = cmd.faults; - - return 0; -} - -static void hmm_buffer_free(struct hmm_buffer *buffer) -{ - if (buffer == NULL) - return; - - if (buffer->ptr) - munmap(buffer->ptr, buffer->size); - free(buffer->mirror); - free(buffer); -} - -/* - * Create a temporary file that will be deleted on close. - */ -static int hmm_create_file(unsigned long size) -{ - char path[HMM_PATH_MAX]; - int fd; - - strcpy(path, "/tmp"); - fd = open(path, O_TMPFILE | O_EXCL | O_RDWR, 0600); - if (fd >= 0) { - int r; - - do { - r = ftruncate(fd, size); - } while (r == -1 && errno == EINTR); - if (!r) - return fd; - close(fd); - } - return -1; -} - -/* - * Return a random unsigned number. - */ -static unsigned int hmm_random(void) -{ - static int fd = -1; - unsigned int r; - - if (fd < 0) { - fd = open("/dev/urandom", O_RDONLY); - if (fd < 0) { - fprintf(stderr, "%s:%d failed to open /dev/urandom\n", - __FILE__, __LINE__); - return ~0U; - } - } - read(fd, &r, sizeof(r)); - return r; -} - -static void hmm_nanosleep(unsigned int n) -{ - struct timespec t; - - t.tv_sec = 0; - t.tv_nsec = n; - nanosleep(&t, NULL); -} - -static int hmm_migrate_sys_to_dev(int fd, - struct hmm_buffer *buffer, - unsigned long npages) -{ - return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_DEV, buffer, npages); -} - -static int hmm_migrate_dev_to_sys(int fd, - struct hmm_buffer *buffer, - unsigned long npages) -{ - return hmm_dmirror_cmd(fd, HMM_DMIRROR_MIGRATE_TO_SYS, buffer, npages); -} - -/* - * Simple NULL test of device open/close. - */ -TEST_F(hmm, open_close) -{ -} - -/* - * Read private anonymous memory. - */ -TEST_F(hmm, anon_read) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - int val; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* - * Initialize buffer in system memory but leave the first two pages - * zero (pte_none and pfn_zero). - */ - i = 2 * self->page_size / sizeof(*ptr); - for (ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Set buffer permission to read-only. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Populate the CPU page table with a special zero page. */ - val = *(int *)(buffer->ptr + self->page_size); - ASSERT_EQ(val, 0); - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. 
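
hmm_dmirror_cmd() above shows the canonical EINTR-retry pattern for ioctl(). Pulled out as a reusable helper, it might look like this (sketch, hypothetical name):

	#include <errno.h>
	#include <sys/ioctl.h>

	/* Retry an ioctl() that may be interrupted by a signal. */
	static int ioctl_eintr_safe(int fd, unsigned long request, void *arg)
	{
		int ret;

		do {
			ret = ioctl(fd, request, arg);
		} while (ret < 0 && errno == EINTR);
		return ret < 0 ? -errno : 0;
	}
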
*/ - ptr = buffer->mirror; - for (i = 0; i < 2 * self->page_size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], 0); - for (; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Read private anonymous memory which has been protected with - * mprotect() PROT_NONE. - */ -TEST_F(hmm, anon_read_prot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize mirror buffer so we can verify it isn't written. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - /* Protect buffer from reading. */ - ret = mprotect(buffer->ptr, size, PROT_NONE); - ASSERT_EQ(ret, 0); - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, -EFAULT); - - /* Allow CPU to read the buffer so we can check it. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - hmm_buffer_free(buffer); -} - -/* - * Write private anonymous memory. - */ -TEST_F(hmm, anon_write) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Write private anonymous memory which has been protected with - * mprotect() PROT_READ. 
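
A small worked example for the recurring fixture math: on a 4 KiB-page system, ffs(4096) - 1 == 12, and HMM_BUFFER_SIZE (1024 << 12, i.e. 4 MiB) works out to 1024 pages (illustrative check, not from the patch):

	#include <strings.h>
	#include <unistd.h>

	#define ALIGN(x, a) (((x) + (a) - 1) & (~((a) - 1)))

	int main(void)
	{
		unsigned long page_size = sysconf(_SC_PAGE_SIZE);
		unsigned int page_shift = ffs((int)page_size) - 1; /* 4096 -> 12 */
		unsigned long npages =
			ALIGN(1024UL << 12, page_size) >> page_shift;

		return (page_shift == 12 && npages == 1024) ? 0 : 1;
	}
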
- */ -TEST_F(hmm, anon_write_prot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Simulate a device reading a zero page of memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - ASSERT_EQ(buffer->faults, 1); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, -EPERM); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], 0); - - /* Now allow writing and see that the zero page is replaced. */ - ret = mprotect(buffer->ptr, size, PROT_WRITE | PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Check that a device writing an anonymous private mapping - * will copy-on-write if a child process inherits the mapping. - */ -TEST_F(hmm, anon_write_child) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - pid_t pid; - int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); - - /* Check that the parent's buffer did not change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - return; - } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. 
*/ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); -} - -/* - * Check that a device writing an anonymous shared mapping - * will not copy-on-write if a child process inherits the mapping. - */ -TEST_F(hmm, anon_write_child_shared) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - pid_t pid; - int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; - - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); - - /* Check that the parent's buffer did change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - return; - } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); -} - -/* - * Write private anonymous huge page. - */ -TEST_F(hmm, anon_write_huge) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - void *old_ptr; - void *map; - int *ptr; - int ret; - - size = 2 * TWOMEG; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - size = TWOMEG; - npages = size >> self->page_shift; - map = (void *)ALIGN((uintptr_t)buffer->ptr, size); - ret = madvise(map, size, MADV_HUGEPAGE); - ASSERT_EQ(ret, 0); - old_ptr = buffer->ptr; - buffer->ptr = map; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. 
*/ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - buffer->ptr = old_ptr; - hmm_buffer_free(buffer); -} - -/* - * Read numeric data from raw and tagged kernel status files. Used to read - * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). - */ -static long file_read_ulong(char *file, const char *tag) -{ - int fd; - char buf[2048]; - int len; - char *p, *q; - long val; - - fd = open(file, O_RDONLY); - if (fd < 0) { - /* Error opening the file */ - return -1; - } - - len = read(fd, buf, sizeof(buf)); - close(fd); - if (len < 0) { - /* Error in reading the file */ - return -1; - } - if (len == sizeof(buf)) { - /* Error file is too large */ - return -1; - } - buf[len] = '\0'; - - /* Search for a tag if provided */ - if (tag) { - p = strstr(buf, tag); - if (!p) - return -1; /* looks like the line we want isn't there */ - p += strlen(tag); - } else - p = buf; - - val = strtol(p, &q, 0); - if (*q != ' ') { - /* Error parsing the file */ - return -1; - } - - return val; -} - -/* - * Write huge TLBFS page. - */ -TEST_F(hmm, anon_write_hugetlbfs) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long default_hsize; - unsigned long i; - int *ptr; - int ret; - - default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) - SKIP(return, "Huge page size could not be determined"); - default_hsize = default_hsize*1024; /* KB to B */ - - size = ALIGN(TWOMEG, default_hsize); - npages = size >> self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (buffer->ptr == MAP_FAILED) { - free(buffer); - SKIP(return, "Huge page could not be allocated"); - } - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - munmap(buffer->ptr, buffer->size); - buffer->ptr = NULL; - hmm_buffer_free(buffer); -} - -/* - * Read mmap'ed file memory. - */ -TEST_F(hmm, file_read) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - int fd; - ssize_t len; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - fd = hmm_create_file(size); - ASSERT_GE(fd, 0); - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = fd; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - /* Write initial contents of the file. 
*/ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - len = pwrite(fd, buffer->mirror, size, 0); - ASSERT_EQ(len, size); - memset(buffer->mirror, 0, size); - - buffer->ptr = mmap(NULL, size, - PROT_READ, - MAP_SHARED, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Write mmap'ed file memory. - */ -TEST_F(hmm, file_write) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - int fd; - ssize_t len; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - fd = hmm_create_file(size); - ASSERT_GE(fd, 0); - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = fd; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Check that the device also wrote the file. */ - len = pread(fd, buffer->mirror, size, 0); - ASSERT_EQ(len, size); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device private memory. - */ -TEST_F(hmm, migrate) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device private memory and fault some of it back - * to system memory, then try migrating the resulting mix of system and device - * private memory to the device. 
- */ -TEST_F(hmm, migrate_fault) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Fault half the pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate memory to the device again. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -TEST_F(hmm, migrate_release) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Release device memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_RELEASE, buffer, npages); - ASSERT_EQ(ret, 0); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / (2 * sizeof(*ptr)); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous shared memory to device private memory. - */ -TEST_F(hmm, migrate_shared) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Migrate memory to device. 
*/ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, -ENOENT); - - hmm_buffer_free(buffer); -} - -/* - * Try to migrate various memory types to device private memory. - */ -TEST_F(hmm2, migrate_mixed) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int *ptr; - unsigned char *p; - int ret; - int val; - - npages = 6; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - p = buffer->ptr; - - /* Migrating a protected area should be an error. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); - ASSERT_EQ(ret, -EINVAL); - - /* Punch a hole after the first page address. */ - ret = munmap(buffer->ptr + self->page_size, self->page_size); - ASSERT_EQ(ret, 0); - - /* We expect an error if the vma doesn't cover the range. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 3); - ASSERT_EQ(ret, -EINVAL); - - /* Page 2 will be a read-only zero page. */ - ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 2 * self->page_size); - val = *ptr + 3; - ASSERT_EQ(val, 3); - - /* Page 3 will be read-only. */ - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 3 * self->page_size); - *ptr = val; - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - - /* Page 4-5 will be read-write. */ - ret = mprotect(buffer->ptr + 4 * self->page_size, 2 * self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 4 * self->page_size); - *ptr = val; - ptr = (int *)(buffer->ptr + 5 * self->page_size); - *ptr = val; - - /* Now try to migrate pages 2-5 to device 1. */ - buffer->ptr = p + 2 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 4); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 4); - - /* Page 5 won't be migrated to device 0 because it's on device 1. */ - buffer->ptr = p + 5 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); - ASSERT_EQ(ret, -ENOENT); - buffer->ptr = p; - - buffer->ptr = p; - hmm_buffer_free(buffer); -} - -/* - * Migrate anonymous memory to device memory and back to system memory - * multiple times. In case of private zone configuration, this is done - * through fault pages accessed by CPU. In case of coherent zone configuration, - * the pages from the device should be explicitly migrated back to system memory. - * The reason is Coherent device zone has coherent access by CPU, therefore - * it will not generate any page fault. 
- */ -TEST_F(hmm, migrate_multiple) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - unsigned long c; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; c++) { - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate back to system memory and check them. */ - if (hmm_is_coherent_type(variant->device_number)) { - ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - } - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); - } -} - -/* - * Read anonymous memory multiple times. - */ -TEST_F(hmm, anon_read_multiple) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - unsigned long c; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; c++) { - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i + c; - - /* Simulate a device reading system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, - npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i + c); - - hmm_buffer_free(buffer); - } -} - -void *unmap_buffer(void *p) -{ - struct hmm_buffer *buffer = p; - - /* Delay for a bit and then unmap buffer while it is being read. */ - hmm_nanosleep(hmm_random() % 32000); - munmap(buffer->ptr + buffer->size / 2, buffer->size / 2); - buffer->ptr = NULL; - - return NULL; -} - -/* - * Try reading anonymous memory while it is being unmapped. 
- */ -TEST_F(hmm, anon_teardown) -{ - unsigned long npages; - unsigned long size; - unsigned long c; - void *ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - for (c = 0; c < NTIMES; ++c) { - pthread_t thread; - struct hmm_buffer *buffer; - unsigned long i; - int *ptr; - int rc; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i + c; - - rc = pthread_create(&thread, NULL, unmap_buffer, buffer); - ASSERT_EQ(rc, 0); - - /* Simulate a device reading system memory. */ - rc = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_READ, buffer, - npages); - if (rc == 0) { - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; - i < size / sizeof(*ptr); - ++i) - ASSERT_EQ(ptr[i], i + c); - } - - pthread_join(thread, &ret); - hmm_buffer_free(buffer); - } -} - -/* - * Test memory snapshot without faulting in pages accessed by the device. - */ -TEST_F(hmm, mixedmap) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned char *m; - int ret; - - npages = 1; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, - self->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Simulate a device snapshotting CPU pagetables. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); - - hmm_buffer_free(buffer); -} - -/* - * Test memory snapshot without faulting in pages accessed by the device. - */ -TEST_F(hmm2, snapshot) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - int *ptr; - unsigned char *p; - unsigned char *m; - int ret; - int val; - - npages = 7; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - p = buffer->ptr; - - /* Punch a hole after the first page address. */ - ret = munmap(buffer->ptr + self->page_size, self->page_size); - ASSERT_EQ(ret, 0); - - /* Page 2 will be read-only zero page. */ - ret = mprotect(buffer->ptr + 2 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 2 * self->page_size); - val = *ptr + 3; - ASSERT_EQ(val, 3); - - /* Page 3 will be read-only. 
*/ - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 3 * self->page_size); - *ptr = val; - ret = mprotect(buffer->ptr + 3 * self->page_size, self->page_size, - PROT_READ); - ASSERT_EQ(ret, 0); - - /* Page 4-6 will be read-write. */ - ret = mprotect(buffer->ptr + 4 * self->page_size, 3 * self->page_size, - PROT_READ | PROT_WRITE); - ASSERT_EQ(ret, 0); - ptr = (int *)(buffer->ptr + 4 * self->page_size); - *ptr = val; - - /* Page 5 will be migrated to device 0. */ - buffer->ptr = p + 5 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd0, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - - /* Page 6 will be migrated to device 1. */ - buffer->ptr = p + 6 * self->page_size; - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, 1); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, 1); - - /* Simulate a device snapshotting CPU pagetables. */ - buffer->ptr = p; - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - ASSERT_EQ(m[0], HMM_DMIRROR_PROT_ERROR); - ASSERT_EQ(m[1], HMM_DMIRROR_PROT_ERROR); - ASSERT_EQ(m[2], HMM_DMIRROR_PROT_ZERO | HMM_DMIRROR_PROT_READ); - ASSERT_EQ(m[3], HMM_DMIRROR_PROT_READ); - ASSERT_EQ(m[4], HMM_DMIRROR_PROT_WRITE); - if (!hmm_is_coherent_type(variant->device_number0)) { - ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL | - HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[6], HMM_DMIRROR_PROT_NONE); - } else { - ASSERT_EQ(m[5], HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | - HMM_DMIRROR_PROT_WRITE); - ASSERT_EQ(m[6], HMM_DMIRROR_PROT_DEV_COHERENT_REMOTE | - HMM_DMIRROR_PROT_WRITE); - } - - hmm_buffer_free(buffer); -} - -/* - * Test the hmm_range_fault() HMM_PFN_PMD flag for large pages that - * should be mapped by a large page table entry. - */ -TEST_F(hmm, compound) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long default_hsize; - int *ptr; - unsigned char *m; - int ret; - unsigned long i; - - /* Skip test if we can't allocate a hugetlbfs page. */ - - default_hsize = file_read_ulong("/proc/meminfo", "Hugepagesize:"); - if (default_hsize < 0 || default_hsize*1024 < default_hsize) - SKIP(return, "Huge page size could not be determined"); - default_hsize = default_hsize*1024; /* KB to B */ - - size = ALIGN(TWOMEG, default_hsize); - npages = size >> self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (buffer->ptr == MAP_FAILED) { - free(buffer); - return; - } - - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - /* Initialize the pages the device will snapshot in buffer->ptr. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Simulate a device snapshotting CPU pagetables. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - for (i = 0; i < npages; ++i) - ASSERT_EQ(m[i], HMM_DMIRROR_PROT_WRITE | - HMM_DMIRROR_PROT_PMD); - - /* Make the region read-only. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate a device snapshotting CPU pagetables. 
*/ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device saw. */ - m = buffer->mirror; - for (i = 0; i < npages; ++i) - ASSERT_EQ(m[i], HMM_DMIRROR_PROT_READ | - HMM_DMIRROR_PROT_PMD); - - munmap(buffer->ptr, buffer->size); - buffer->ptr = NULL; - hmm_buffer_free(buffer); -} - -/* - * Test two devices reading the same memory (double mapped). - */ -TEST_F(hmm2, double_map) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = 6; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(npages); - ASSERT_NE(buffer->mirror, NULL); - - /* Reserve a range of addresses. */ - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Make region read-only. */ - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate device 0 reading system memory. */ - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Simulate device 1 reading system memory. */ - ret = hmm_dmirror_cmd(self->fd1, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Migrate pages to device 1 and try to read from device 0. */ - ret = hmm_migrate_sys_to_dev(self->fd1, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - ret = hmm_dmirror_cmd(self->fd0, HMM_DMIRROR_READ, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what device 0 read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - hmm_buffer_free(buffer); -} - -/* - * Basic check of exclusive faulting. - */ -TEST_F(hmm, exclusive) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. 
*/ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i]++, i); - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i+1); - - /* Check atomic access revoked */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_CHECK_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - - hmm_buffer_free(buffer); -} - -TEST_F(hmm, exclusive_mprotect) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - ret = mprotect(buffer->ptr, size, PROT_READ); - ASSERT_EQ(ret, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, -EPERM); - - hmm_buffer_free(buffer); -} - -/* - * Check copy-on-write works. - */ -TEST_F(hmm, exclusive_cow) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Map memory exclusively for device access. */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_EXCLUSIVE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - fork(); - - /* Fault pages back to system memory and check them. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i]++, i); - - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i+1); - - hmm_buffer_free(buffer); -} - -static int gup_test_exec(int gup_fd, unsigned long addr, int cmd, - int npages, int size, int flags) -{ - struct gup_test gup = { - .nr_pages_per_call = npages, - .addr = addr, - .gup_flags = FOLL_WRITE | flags, - .size = size, - }; - - if (ioctl(gup_fd, cmd, &gup)) { - perror("ioctl on error\n"); - return errno; - } - - return 0; -} - -/* - * Test get user device pages through gup_test. Setting PIN_LONGTERM flag. 
- * This should trigger a migration back to system memory for both, private - * and coherent type pages. - * This test makes use of gup_test module. Make sure GUP_TEST_CONFIG is added - * to your configuration before you run it. - */ -TEST_F(hmm, hmm_gup_test) -{ - struct hmm_buffer *buffer; - int gup_fd; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - unsigned char *m; - - gup_fd = open("/sys/kernel/debug/gup_test", O_RDWR); - if (gup_fd == -1) - SKIP(return, "Skipping test, could not find gup_test driver"); - - npages = 4; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - /* Check what the device read. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - ASSERT_EQ(gup_test_exec(gup_fd, - (unsigned long)buffer->ptr, - GUP_BASIC_TEST, 1, self->page_size, 0), 0); - ASSERT_EQ(gup_test_exec(gup_fd, - (unsigned long)buffer->ptr + 1 * self->page_size, - GUP_FAST_BENCHMARK, 1, self->page_size, 0), 0); - ASSERT_EQ(gup_test_exec(gup_fd, - (unsigned long)buffer->ptr + 2 * self->page_size, - PIN_FAST_BENCHMARK, 1, self->page_size, FOLL_LONGTERM), 0); - ASSERT_EQ(gup_test_exec(gup_fd, - (unsigned long)buffer->ptr + 3 * self->page_size, - PIN_LONGTERM_BENCHMARK, 1, self->page_size, 0), 0); - - /* Take snapshot to CPU pagetables */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - m = buffer->mirror; - if (hmm_is_coherent_type(variant->device_number)) { - ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[0]); - ASSERT_EQ(HMM_DMIRROR_PROT_DEV_COHERENT_LOCAL | HMM_DMIRROR_PROT_WRITE, m[1]); - } else { - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[0]); - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[1]); - } - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[2]); - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[3]); - /* - * Check again the content on the pages. Make sure there's no - * corrupted data. - */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - - close(gup_fd); - hmm_buffer_free(buffer); -} - -/* - * Test copy-on-write in device pages. - * In case of writing to COW private page(s), a page fault will migrate pages - * back to system memory first. Then, these pages will be duplicated. In case - * of COW device coherent type, pages are duplicated directly from device - * memory. 
- */ -TEST_F(hmm, hmm_cow_in_device) -{ - struct hmm_buffer *buffer; - unsigned long npages; - unsigned long size; - unsigned long i; - int *ptr; - int ret; - unsigned char *m; - pid_t pid; - int status; - - npages = 4; - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer in system memory. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Migrate memory to device. */ - - ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (!pid) { - /* Child process waits for SIGTERM from the parent. */ - while (1) { - } - perror("Should not reach this\n"); - exit(0); - } - /* Parent process writes to COW page(s) and gets a - * new copy in system. In case of device private pages, - * this write causes a migration to system mem first. - */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Terminate child and wait */ - EXPECT_EQ(0, kill(pid, SIGTERM)); - EXPECT_EQ(pid, waitpid(pid, &status, 0)); - EXPECT_NE(0, WIFSIGNALED(status)); - EXPECT_EQ(SIGTERM, WTERMSIG(status)); - - /* Take snapshot to CPU pagetables */ - ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - m = buffer->mirror; - for (i = 0; i < npages; i++) - ASSERT_EQ(HMM_DMIRROR_PROT_WRITE, m[i]); - - hmm_buffer_free(buffer); -} -TEST_HARNESS_MAIN --- a/tools/testing/selftests/vm/hugepage-mmap.c +++ /dev/null @@ -1,91 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-mmap: - * - * Example of using huge page memory in a user application using the mmap - * system call. Before running this application, make sure that the - * administrator has mounted the hugetlbfs filesystem (on some directory - * like /mnt) using the command mount -t hugetlbfs nodev /mnt. In this - * example, the app is requesting memory of size 256MB that is backed by - * huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. - * Other architectures, such as ppc64, i386 or x86_64 are not so constrained.
- */ -#define _GNU_SOURCE -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <sys/mman.h> -#include <fcntl.h> - -#define LENGTH (256UL*1024*1024) -#define PROTECTION (PROT_READ | PROT_WRITE) - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_SHARED | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_SHARED) -#endif - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr) -{ - unsigned long i; - - for (i = 0; i < LENGTH; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < LENGTH; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -int main(void) -{ - void *addr; - int fd, ret; - - fd = memfd_create("hugepage-mmap", MFD_HUGETLB); - if (fd < 0) { - perror("memfd_create() failed"); - exit(1); - } - - addr = mmap(ADDR, LENGTH, PROTECTION, FLAGS, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - close(fd); - exit(1); - } - - printf("Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr); - ret = read_bytes(addr); - - munmap(addr, LENGTH); - close(fd); - - return ret; -} --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ /dev/null @@ -1,188 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-mremap: - * - * Example of remapping huge page memory in a user application using the - * mremap system call. The path to a file in a hugetlbfs filesystem must - * be passed as the last argument to this test. The amount of memory used - * by this test in MBs can optionally be passed as an argument. If no memory - * amount is passed, the default amount is 10MB. - * - * To make sure the test triggers pmd sharing and goes through the 'unshare' - * path in the mremap code use 1GB (1024) or more. - */ - -#define _GNU_SOURCE -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <sys/mman.h> -#include <errno.h> -#include <fcntl.h> /* Definition of O_* constants */ -#include <sys/syscall.h> /* Definition of SYS_* constants */ -#include <linux/userfaultfd.h> -#include <sys/ioctl.h> -#include <string.h> - -#define DEFAULT_LENGTH_MB 10UL -#define MB_TO_BYTES(x) (x * 1024 * 1024) - -#define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) -#define FLAGS (MAP_SHARED | MAP_ANONYMOUS) - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr, size_t len) -{ - unsigned long i; - - for (i = 0; i < len; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr, size_t len) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < len; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -static void register_region_with_uffd(char *addr, size_t len) -{ - long uffd; /* userfaultfd file descriptor */ - struct uffdio_api uffdio_api; - struct uffdio_register uffdio_register; - - /* Create and enable userfaultfd object. */ - - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd == -1) { - perror("userfaultfd"); - exit(1); - } - - uffdio_api.api = UFFD_API; - uffdio_api.features = 0; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { - perror("ioctl-UFFDIO_API"); - exit(1); - } - - /* Create a private anonymous mapping. The memory will be - * demand-zero paged--that is, not yet allocated. 
When we - * actually touch the memory, it will be allocated via - * the userfaultfd. - */ - - addr = mmap(NULL, len, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - printf("Address returned by mmap() = %p\n", addr); - - /* Register the memory range of the mapping we just created for - * handling by the userfaultfd object. In mode, we request to track - * missing pages (i.e., pages that have not yet been faulted in). - */ - - uffdio_register.range.start = (unsigned long)addr; - uffdio_register.range.len = len; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { - perror("ioctl-UFFDIO_REGISTER"); - exit(1); - } -} - -int main(int argc, char *argv[]) -{ - size_t length = 0; - int ret = 0, fd; - - if (argc >= 2 && !strcmp(argv[1], "-h")) { - printf("Usage: %s [length_in_MB]\n", argv[0]); - exit(1); - } - - /* Read memory length as the first arg if valid, otherwise fallback to - * the default length. - */ - if (argc >= 2) - length = (size_t)atoi(argv[1]); - else - length = DEFAULT_LENGTH_MB; - - length = MB_TO_BYTES(length); - fd = memfd_create(argv[0], MFD_HUGETLB); - if (fd < 0) { - perror("Open failed"); - exit(1); - } - - /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ - unsigned long suggested_addr = 0x7eaa40000000; - void *haddr = mmap((void *)suggested_addr, length, PROTECTION, - MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); - printf("Map haddr: Returned address is %p\n", haddr); - if (haddr == MAP_FAILED) { - perror("mmap1"); - exit(1); - } - - /* mmap again to a dummy address to hopefully trigger pmd sharing. */ - suggested_addr = 0x7daa40000000; - void *daddr = mmap((void *)suggested_addr, length, PROTECTION, - MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); - printf("Map daddr: Returned address is %p\n", daddr); - if (daddr == MAP_FAILED) { - perror("mmap3"); - exit(1); - } - - suggested_addr = 0x7faa40000000; - void *vaddr = - mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); - printf("Map vaddr: Returned address is %p\n", vaddr); - if (vaddr == MAP_FAILED) { - perror("mmap2"); - exit(1); - } - - register_region_with_uffd(haddr, length); - - void *addr = mremap(haddr, length, length, - MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); - if (addr == MAP_FAILED) { - perror("mremap"); - exit(1); - } - - printf("Mremap: Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr, length); - ret = read_bytes(addr, length); - - munmap(addr, length); - - addr = mremap(addr, length, length, 0); - if (addr != MAP_FAILED) { - printf("mremap: Expected failure, but call succeeded\n"); - exit(1); - } - - close(fd); - - return ret; -} --- a/tools/testing/selftests/vm/hugepage-shm.c +++ /dev/null @@ -1,101 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-shm: - * - * Example of using huge page memory in a user application using Sys V shared - * memory system calls. In this example the app is requesting 256MB of - * memory that is backed by huge pages. The application uses the flag - * SHM_HUGETLB in the shmget system call to inform the kernel that it is - * requesting huge pages. - * - * For the ia64 architecture, the Linux kernel reserves Region number 4 for - * huge pages. That means that if one requires a fixed address, a huge page - * aligned address starting with 0x800000... will be required. If a fixed - * address is not required, the kernel will select an address in the proper - * range. 
- * Other architectures, such as ppc64, i386 or x86_64 are not so constrained. - * - * Note: The default shared memory limit is quite low on many kernels, - * you may need to increase it via: - * - * echo 268435456 > /proc/sys/kernel/shmmax - * - * This will increase the maximum size per shared memory segment to 256MB. - * The other limit that you will hit eventually is shmall which is the - * total amount of shared memory in pages. To set it to 16GB on a system - * with a 4kB pagesize do: - * - * echo 4194304 > /proc/sys/kernel/shmall - */ - -#include <stdlib.h> -#include <stdio.h> -#include <sys/types.h> -#include <sys/ipc.h> -#include <sys/shm.h> -#include <sys/mman.h> - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - -#define LENGTH (256UL*1024*1024) - -#define dprintf(x) printf(x) - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define SHMAT_FLAGS (SHM_RND) -#else -#define ADDR (void *)(0x0UL) -#define SHMAT_FLAGS (0) -#endif - -int main(void) -{ - int shmid; - unsigned long i; - char *shmaddr; - - shmid = shmget(2, LENGTH, SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) { - perror("shmget"); - exit(1); - } - printf("shmid: 0x%x\n", shmid); - - shmaddr = shmat(shmid, ADDR, SHMAT_FLAGS); - if (shmaddr == (char *)-1) { - perror("Shared memory attach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(2); - } - printf("shmaddr: %p\n", shmaddr); - - dprintf("Starting the writes:\n"); - for (i = 0; i < LENGTH; i++) { - shmaddr[i] = (char)(i); - if (!(i % (1024 * 1024))) - dprintf("."); - } - dprintf("\n"); - - dprintf("Starting the Check..."); - for (i = 0; i < LENGTH; i++) - if (shmaddr[i] != (char)i) { - printf("\nIndex %lu mismatched\n", i); - exit(3); - } - dprintf("Done.\n"); - - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(4); - } - - shmctl(shmid, IPC_RMID, NULL); - - return 0; -} --- a/tools/testing/selftests/vm/hugepage-vmemmap.c +++ /dev/null @@ -1,144 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * A test case of using hugepage memory in a user application using the - * mmap system call with MAP_HUGETLB flag. Before running this program - * make sure the administrator has allocated enough default sized huge - * pages to cover the 2 MB allocation. - */ -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <sys/mman.h> -#include <fcntl.h> - -#define MAP_LENGTH (2UL * 1024 * 1024) - -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x40000 /* arch specific */ -#endif - -#define PAGE_SIZE 4096 - -#define PAGE_COMPOUND_HEAD (1UL << 15) -#define PAGE_COMPOUND_TAIL (1UL << 16) -#define PAGE_HUGE (1UL << 17) - -#define HEAD_PAGE_FLAGS (PAGE_COMPOUND_HEAD | PAGE_HUGE) -#define TAIL_PAGE_FLAGS (PAGE_COMPOUND_TAIL | PAGE_HUGE) - -#define PM_PFRAME_BITS 55 -#define PM_PFRAME_MASK ~((1UL << PM_PFRAME_BITS) - 1) - -/* - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. 
- */ -#ifdef __ia64__ -#define MAP_ADDR (void *)(0x8000000000000000UL) -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define MAP_ADDR NULL -#define MAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - -static void write_bytes(char *addr, size_t length) -{ - unsigned long i; - - for (i = 0; i < length; i++) - *(addr + i) = (char)i; -} - -static unsigned long virt_to_pfn(void *addr) -{ - int fd; - unsigned long pagemap; - - fd = open("/proc/self/pagemap", O_RDONLY); - if (fd < 0) - return -1UL; - - lseek(fd, (unsigned long)addr / PAGE_SIZE * sizeof(pagemap), SEEK_SET); - read(fd, &pagemap, sizeof(pagemap)); - close(fd); - - return pagemap & ~PM_PFRAME_MASK; -} - -static int check_page_flags(unsigned long pfn) -{ - int fd, i; - unsigned long pageflags; - - fd = open("/proc/kpageflags", O_RDONLY); - if (fd < 0) - return -1; - - lseek(fd, pfn * sizeof(pageflags), SEEK_SET); - - read(fd, &pageflags, sizeof(pageflags)); - if ((pageflags & HEAD_PAGE_FLAGS) != HEAD_PAGE_FLAGS) { - close(fd); - printf("Head page flags (%lx) is invalid\n", pageflags); - return -1; - } - - /* - * pages other than the first page must be tail and shouldn't be head; - * this also verifies kernel has correctly set the fake page_head to tail - * while hugetlb_free_vmemmap is enabled. - */ - for (i = 1; i < MAP_LENGTH / PAGE_SIZE; i++) { - read(fd, &pageflags, sizeof(pageflags)); - if ((pageflags & TAIL_PAGE_FLAGS) != TAIL_PAGE_FLAGS || - (pageflags & HEAD_PAGE_FLAGS) == HEAD_PAGE_FLAGS) { - close(fd); - printf("Tail page flags (%lx) is invalid\n", pageflags); - return -1; - } - } - - close(fd); - - return 0; -} - -int main(int argc, char **argv) -{ - void *addr; - unsigned long pfn; - - addr = mmap(MAP_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, MAP_FLAGS, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* Trigger allocation of HugeTLB page. */ - write_bytes(addr, MAP_LENGTH); - - pfn = virt_to_pfn(addr); - if (pfn == -1UL) { - munmap(addr, MAP_LENGTH); - perror("virt_to_pfn"); - exit(1); - } - - printf("Returned address is %p whose pfn is %lx\n", addr, pfn); - - if (check_page_flags(pfn) < 0) { - munmap(addr, MAP_LENGTH); - perror("check_page_flags"); - exit(1); - } - - /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, MAP_LENGTH)) { - perror("munmap"); - exit(1); - } - - return 0; -} --- a/tools/testing/selftests/vm/hugetlb-madvise.c +++ /dev/null @@ -1,406 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * hugepage-madvise: - * - * Basic functional testing of madvise MADV_DONTNEED and MADV_REMOVE - * on hugetlb mappings. - * - * Before running this test, make sure the administrator has pre-allocated - * at least MIN_FREE_PAGES hugetlb pages and they are free. In addition, - * the test takes an argument that is the path to a file in a hugetlbfs - * filesystem. Therefore, a hugetlbfs filesystem must be mounted on some - * directory. 
- */ - -#define _GNU_SOURCE -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <sys/mman.h> -#define __USE_GNU -#include <fcntl.h> - -#define MIN_FREE_PAGES 20 -#define NR_HUGE_PAGES 10 /* common number of pages to map/allocate */ - -#define validate_free_pages(exp_free) \ - do { \ - int fhp = get_free_hugepages(); \ - if (fhp != (exp_free)) { \ - printf("Unexpected number of free huge " \ - "pages line %d\n", __LINE__); \ - exit(1); \ - } \ - } while (0) - -unsigned long huge_page_size; -unsigned long base_page_size; - -/* - * default_huge_page_size copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -unsigned long get_free_hugepages(void) -{ - unsigned long fhp = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return fhp; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "HugePages_Free: %lu", &fhp) == 1) - break; - } - - free(line); - fclose(f); - return fhp; -} - -void write_fault_pages(void *addr, unsigned long nr_pages) -{ - unsigned long i; - - for (i = 0; i < nr_pages; i++) - *((unsigned long *)(addr + (i * huge_page_size))) = i; -} - -void read_fault_pages(void *addr, unsigned long nr_pages) -{ - unsigned long dummy = 0; - unsigned long i; - - for (i = 0; i < nr_pages; i++) - dummy += *((unsigned long *)(addr + (i * huge_page_size))); -} - -int main(int argc, char **argv) -{ - unsigned long free_hugepages; - void *addr, *addr2; - int fd; - int ret; - - huge_page_size = default_huge_page_size(); - if (!huge_page_size) { - printf("Unable to determine huge page size, exiting!\n"); - exit(1); - } - base_page_size = sysconf(_SC_PAGE_SIZE); - if (!base_page_size) { - printf("Unable to determine base page size, exiting!\n"); - exit(1); - } - - free_hugepages = get_free_hugepages(); - if (free_hugepages < MIN_FREE_PAGES) { - printf("Not enough free huge pages to test, exiting!\n"); - exit(1); - } - - fd = memfd_create(argv[0], MFD_HUGETLB); - if (fd < 0) { - perror("memfd_create() failed"); - exit(1); - } - - /* - * Test validity of MADV_DONTNEED addr and length arguments. mmap - * size is NR_HUGE_PAGES + 2. One page at the beginning and end of - * the mapping will be unmapped so we KNOW there is nothing mapped - * there.
- */ - addr = mmap(NULL, (NR_HUGE_PAGES + 2) * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - if (munmap(addr, huge_page_size) || - munmap(addr + (NR_HUGE_PAGES + 1) * huge_page_size, - huge_page_size)) { - perror("munmap"); - exit(1); - } - addr = addr + huge_page_size; - - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* addr before mapping should fail */ - ret = madvise(addr - base_page_size, NR_HUGE_PAGES * huge_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid addr line %d\n", - __LINE__); - exit(1); - } - - /* addr + length after mapping should fail */ - ret = madvise(addr, (NR_HUGE_PAGES * huge_page_size) + base_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with invalid length line %d\n", - __LINE__); - exit(1); - } - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test alignment of MADV_DONTNEED addr and length arguments - */ - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* addr is not huge page size aligned and should fail */ - ret = madvise(addr + base_page_size, - NR_HUGE_PAGES * huge_page_size - base_page_size, - MADV_DONTNEED); - if (!ret) { - printf("Unexpected success of madvise call with unaligned start address %d\n", - __LINE__); - exit(1); - } - - /* addr + length should be aligned down to huge page size */ - if (madvise(addr, - ((NR_HUGE_PAGES - 1) * huge_page_size) + base_page_size, - MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - - /* should free all but last page in mapping */ - validate_free_pages(free_hugepages - 1); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - validate_free_pages(free_hugepages); - - /* - * Test MADV_DONTNEED on anonymous private mapping - */ - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, - -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - - /* should free all pages in mapping */ - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_DONTNEED on private mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* read should not consume any pages */ - read_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* writes should allocate private pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages 
- (2 * NR_HUGE_PAGES)); - - /* madvise should free private pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* writes should allocate private pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* - * The fallocate below certainly should free the pages associated - * with the file. However, pages in the private mapping are also - * freed. This is not the 'correct' behavior, but is expected - * because this is how it has worked since the initial hugetlb - * implementation. - */ - if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_DONTNEED on shared mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* write should not consume any pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* madvise should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* - * Test MADV_REMOVE on shared mapping of hugetlb file - * - * madvise is same as hole punch and should free all pages. - */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages); - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - - /* - * Test MADV_REMOVE on shared and private mapping of hugetlb file - */ - if (fallocate(fd, 0, 0, NR_HUGE_PAGES * huge_page_size)) { - perror("fallocate"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_SHARED, fd, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* shared write should not consume any additional pages */ - write_fault_pages(addr, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - addr2 = mmap(NULL, NR_HUGE_PAGES * huge_page_size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE, fd, 0); - if (addr2 == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - /* private read should not consume any pages */ - read_fault_pages(addr2, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* private write should consume additional pages */ - write_fault_pages(addr2, NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* madvise of shared mapping should not free any pages */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* madvise of private mapping should free private pages */ - if (madvise(addr2, NR_HUGE_PAGES * huge_page_size, MADV_DONTNEED)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages - NR_HUGE_PAGES); - - /* private write should consume additional pages again */ - write_fault_pages(addr2, 
NR_HUGE_PAGES); - validate_free_pages(free_hugepages - (2 * NR_HUGE_PAGES)); - - /* - * madvise should free both file and private pages although this is - * not correct. private pages should not be freed, but this is - * expected. See comment associated with FALLOC_FL_PUNCH_HOLE call. - */ - if (madvise(addr, NR_HUGE_PAGES * huge_page_size, MADV_REMOVE)) { - perror("madvise"); - exit(1); - } - validate_free_pages(free_hugepages); - - (void)munmap(addr, NR_HUGE_PAGES * huge_page_size); - (void)munmap(addr2, NR_HUGE_PAGES * huge_page_size); - - close(fd); - return 0; -} --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ /dev/null @@ -1,252 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -set -e - -if [[ $(id -u) -ne 0 ]]; then - echo "This test must be run as root. Skipping..." - exit $ksft_skip -fi - -usage_file=usage_in_bytes - -if [[ "$1" == "-cgroup-v2" ]]; then - cgroup2=1 - usage_file=current -fi - - -if [[ $cgroup2 ]]; then - CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') - if [[ -z "$CGROUP_ROOT" ]]; then - CGROUP_ROOT=/dev/cgroup/memory - mount -t cgroup2 none $CGROUP_ROOT - do_umount=1 - fi - echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control -else - CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') - if [[ -z "$CGROUP_ROOT" ]]; then - CGROUP_ROOT=/dev/cgroup/memory - mount -t cgroup memory,hugetlb $CGROUP_ROOT - do_umount=1 - fi -fi -MNT='/mnt/huge/' - -function get_machine_hugepage_size() { - hpz=$(grep -i hugepagesize /proc/meminfo) - kb=${hpz:14:-3} - mb=$(($kb / 1024)) - echo $mb -} - -MB=$(get_machine_hugepage_size) - -function cleanup() { - echo cleanup - set +e - rm -rf "$MNT"/* 2>/dev/null - umount "$MNT" 2>/dev/null - rmdir "$MNT" 2>/dev/null - rmdir "$CGROUP_ROOT"/a/b 2>/dev/null - rmdir "$CGROUP_ROOT"/a 2>/dev/null - rmdir "$CGROUP_ROOT"/test1 2>/dev/null - echo 0 >/proc/sys/vm/nr_hugepages - set -e -} - -function assert_state() { - local expected_a="$1" - local expected_a_hugetlb="$2" - local expected_b="" - local expected_b_hugetlb="" - - if [ ! -z ${3:-} ] && [ ! 
-z ${4:-} ]; then - expected_b="$3" - expected_b_hugetlb="$4" - fi - local tolerance=$((5 * 1024 * 1024)) - - local actual_a - actual_a="$(cat "$CGROUP_ROOT"/a/memory.$usage_file)" - if [[ $actual_a -lt $(($expected_a - $tolerance)) ]] || - [[ $actual_a -gt $(($expected_a + $tolerance)) ]]; then - echo actual a = $((${actual_a%% *} / 1024 / 1024)) MB - echo expected a = $((${expected_a%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - local actual_a_hugetlb - actual_a_hugetlb="$(cat "$CGROUP_ROOT"/a/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_a_hugetlb -lt $(($expected_a_hugetlb - $tolerance)) ]] || - [[ $actual_a_hugetlb -gt $(($expected_a_hugetlb + $tolerance)) ]]; then - echo actual a hugetlb = $((${actual_a_hugetlb%% *} / 1024 / 1024)) MB - echo expected a hugetlb = $((${expected_a_hugetlb%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - if [[ -z "$expected_b" || -z "$expected_b_hugetlb" ]]; then - return - fi - - local actual_b - actual_b="$(cat "$CGROUP_ROOT"/a/b/memory.$usage_file)" - if [[ $actual_b -lt $(($expected_b - $tolerance)) ]] || - [[ $actual_b -gt $(($expected_b + $tolerance)) ]]; then - echo actual b = $((${actual_b%% *} / 1024 / 1024)) MB - echo expected b = $((${expected_b%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi - - local actual_b_hugetlb - actual_b_hugetlb="$(cat "$CGROUP_ROOT"/a/b/hugetlb.${MB}MB.$usage_file)" - if [[ $actual_b_hugetlb -lt $(($expected_b_hugetlb - $tolerance)) ]] || - [[ $actual_b_hugetlb -gt $(($expected_b_hugetlb + $tolerance)) ]]; then - echo actual b hugetlb = $((${actual_b_hugetlb%% *} / 1024 / 1024)) MB - echo expected b hugetlb = $((${expected_b_hugetlb%% *} / 1024 / 1024)) MB - echo fail - - cleanup - exit 1 - fi -} - -function setup() { - echo 100 >/proc/sys/vm/nr_hugepages - mkdir "$CGROUP_ROOT"/a - sleep 1 - if [[ $cgroup2 ]]; then - echo "+hugetlb +memory" >$CGROUP_ROOT/a/cgroup.subtree_control - else - echo 0 >$CGROUP_ROOT/a/cpuset.mems - echo 0 >$CGROUP_ROOT/a/cpuset.cpus - fi - - mkdir "$CGROUP_ROOT"/a/b - - if [[ ! $cgroup2 ]]; then - echo 0 >$CGROUP_ROOT/a/b/cpuset.mems - echo 0 >$CGROUP_ROOT/a/b/cpuset.cpus - fi - - mkdir -p "$MNT" - mount -t hugetlbfs none "$MNT" -} - -write_hugetlbfs() { - local cgroup="$1" - local path="$2" - local size="$3" - - if [[ $cgroup2 ]]; then - echo $$ >$CGROUP_ROOT/$cgroup/cgroup.procs - else - echo 0 >$CGROUP_ROOT/$cgroup/cpuset.mems - echo 0 >$CGROUP_ROOT/$cgroup/cpuset.cpus - echo $$ >"$CGROUP_ROOT/$cgroup/tasks" - fi - ./write_to_hugetlbfs -p "$path" -s "$size" -m 0 -o - if [[ $cgroup2 ]]; then - echo $$ >$CGROUP_ROOT/cgroup.procs - else - echo $$ >"$CGROUP_ROOT/tasks" - fi - echo -} - -set -e - -size=$((${MB} * 1024 * 1024 * 25)) # 50MB = 25 * 2MB hugepages. - -cleanup - -echo -echo -echo Test charge, rmdir, uncharge -setup -echo mkdir -mkdir $CGROUP_ROOT/test1 - -echo write -write_hugetlbfs test1 "$MNT"/test $size - -echo rmdir -rmdir $CGROUP_ROOT/test1 -mkdir $CGROUP_ROOT/test1 - -echo uncharge -rm -rf /mnt/huge/* - -cleanup - -echo done -echo -echo -if [[ ! $cgroup2 ]]; then - echo "Test parent and child hugetlb usage" - setup - - echo write - write_hugetlbfs a "$MNT"/test $size - - echo Assert memory charged correctly for parent use. - assert_state 0 $size 0 0 - - write_hugetlbfs a/b "$MNT"/test2 $size - - echo Assert memory charged correctly for child use. - assert_state 0 $(($size * 2)) 0 $size - - rmdir "$CGROUP_ROOT"/a/b - sleep 5 - echo Assert memory reparent correctly. 
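- # After rmdir of a/b while its hugetlb pages are still in use, b's - # charges are reparented to a, so a's usage should still account for - # both allocations (2 * size) even though b is gone.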
- assert_state 0 $(($size * 2)) - - rm -rf "$MNT"/* - umount "$MNT" - echo Assert memory uncharged correctly. - assert_state 0 0 - - cleanup -fi - -echo -echo -echo "Test child only hugetlb usage" -echo setup -setup - -echo write -write_hugetlbfs a/b "$MNT"/test2 $size - -echo Assert memory charged correctly for child only use. -assert_state 0 $(($size)) 0 $size - -rmdir "$CGROUP_ROOT"/a/b -echo Assert memory reparent correctly. -assert_state 0 $size - -rm -rf "$MNT"/* -umount "$MNT" -echo Assert memory uncharged correctly. -assert_state 0 0 - -cleanup - -echo ALL PASS - -umount $CGROUP_ROOT -rm -rf $CGROUP_ROOT --- a/tools/testing/selftests/vm/khugepaged.c +++ /dev/null @@ -1,1558 +0,0 @@ -#define _GNU_SOURCE -#include <ctype.h> -#include <errno.h> -#include <fcntl.h> -#include <limits.h> -#include <dirent.h> -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <stdbool.h> -#include <string.h> -#include <unistd.h> - -#include <sys/mman.h> -#include <sys/wait.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <sys/sysmacros.h> -#include <sys/vfs.h> - -#include "linux/magic.h" - -#include "vm_util.h" - -#ifndef MADV_PAGEOUT -#define MADV_PAGEOUT 21 -#endif -#ifndef MADV_POPULATE_READ -#define MADV_POPULATE_READ 22 -#endif -#ifndef MADV_COLLAPSE -#define MADV_COLLAPSE 25 -#endif - -#define BASE_ADDR ((void *)(1UL << 30)) -static unsigned long hpage_pmd_size; -static unsigned long page_size; -static int hpage_pmd_nr; - -#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" -#define PID_SMAPS "/proc/self/smaps" -#define TEST_FILE "collapse_test_file" - -#define MAX_LINE_LENGTH 500 - -enum vma_type { - VMA_ANON, - VMA_FILE, - VMA_SHMEM, -}; - -struct mem_ops { - void *(*setup_area)(int nr_hpages); - void (*cleanup_area)(void *p, unsigned long size); - void (*fault)(void *p, unsigned long start, unsigned long end); - bool (*check_huge)(void *addr, int nr_hpages); - const char *name; -}; - -static struct mem_ops *file_ops; -static struct mem_ops *anon_ops; -static struct mem_ops *shmem_ops; - -struct collapse_context { - void (*collapse)(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect); - bool enforce_pte_scan_limits; - const char *name; -}; - -static struct collapse_context *khugepaged_context; -static struct collapse_context *madvise_context; - -struct file_info { - const char *dir; - char path[PATH_MAX]; - enum vma_type type; - int fd; - char dev_queue_read_ahead_path[PATH_MAX]; -}; - -static struct file_info finfo; - -enum thp_enabled { - THP_ALWAYS, - THP_MADVISE, - THP_NEVER, -}; - -static const char *thp_enabled_strings[] = { - "always", - "madvise", - "never", - NULL -}; - -enum thp_defrag { - THP_DEFRAG_ALWAYS, - THP_DEFRAG_DEFER, - THP_DEFRAG_DEFER_MADVISE, - THP_DEFRAG_MADVISE, - THP_DEFRAG_NEVER, -}; - -static const char *thp_defrag_strings[] = { - "always", - "defer", - "defer+madvise", - "madvise", - "never", - NULL -}; - -enum shmem_enabled { - SHMEM_ALWAYS, - SHMEM_WITHIN_SIZE, - SHMEM_ADVISE, - SHMEM_NEVER, - SHMEM_DENY, - SHMEM_FORCE, -}; - -static const char *shmem_enabled_strings[] = { - "always", - "within_size", - "advise", - "never", - "deny", - "force", - NULL -}; - -struct khugepaged_settings { - bool defrag; - unsigned int alloc_sleep_millisecs; - unsigned int scan_sleep_millisecs; - unsigned int max_ptes_none; - unsigned int max_ptes_swap; - unsigned int max_ptes_shared; - unsigned long pages_to_scan; -}; - -struct settings { - enum thp_enabled thp_enabled; - enum thp_defrag thp_defrag; - enum shmem_enabled 
shmem_enabled; - bool use_zero_page; - struct khugepaged_settings khugepaged; - unsigned long read_ahead_kb; -}; - -static struct settings saved_settings; -static bool skip_settings_restore; - -static int exit_status; - -static void success(const char *msg) -{ - printf(" \e[32m%s\e[0m\n", msg); -} - -static void fail(const char *msg) -{ - printf(" \e[31m%s\e[0m\n", msg); - exit_status++; -} - -static void skip(const char *msg) -{ - printf(" \e[33m%s\e[0m\n", msg); -} - -static int read_file(const char *path, char *buf, size_t buflen) -{ - int fd; - ssize_t numread; - - fd = open(path, O_RDONLY); - if (fd == -1) - return 0; - - numread = read(fd, buf, buflen - 1); - if (numread < 1) { - close(fd); - return 0; - } - - buf[numread] = '\0'; - close(fd); - - return (unsigned int) numread; -} - -static int write_file(const char *path, const char *buf, size_t buflen) -{ - int fd; - ssize_t numwritten; - - fd = open(path, O_WRONLY); - if (fd == -1) { - printf("open(%s)\n", path); - exit(EXIT_FAILURE); - return 0; - } - - numwritten = write(fd, buf, buflen - 1); - close(fd); - if (numwritten < 1) { - printf("write(%s)\n", buf); - exit(EXIT_FAILURE); - return 0; - } - - return (unsigned int) numwritten; -} - -static int read_string(const char *name, const char *strings[]) -{ - char path[PATH_MAX]; - char buf[256]; - char *c; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!read_file(path, buf, sizeof(buf))) { - perror(path); - exit(EXIT_FAILURE); - } - - c = strchr(buf, '['); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - - c++; - memmove(buf, c, sizeof(buf) - (c - buf)); - - c = strchr(buf, ']'); - if (!c) { - printf("%s: Parse failure\n", __func__); - exit(EXIT_FAILURE); - } - *c = '\0'; - - ret = 0; - while (strings[ret]) { - if (!strcmp(strings[ret], buf)) - return ret; - ret++; - } - - printf("Failed to parse %s\n", name); - exit(EXIT_FAILURE); -} - -static void write_string(const char *name, const char *val) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!write_file(path, val, strlen(val) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static const unsigned long _read_num(const char *path) -{ - char buf[21]; - - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(read_num)"); - exit(EXIT_FAILURE); - } - - return strtoul(buf, NULL, 10); -} - -static const unsigned long read_num(const char *name) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return _read_num(path); -} - -static void _write_num(const char *path, unsigned long num) -{ - char buf[21]; - - sprintf(buf, "%ld", num); - if (!write_file(path, buf, strlen(buf) + 1)) { - perror(path); - exit(EXIT_FAILURE); - } -} - -static void write_num(const char *name, unsigned long num) -{ - char path[PATH_MAX]; - int ret; - - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - _write_num(path, num); -} - -static void write_settings(struct settings *settings) -{ - struct khugepaged_settings *khugepaged = &settings->khugepaged; - - write_string("enabled", 
thp_enabled_strings[settings->thp_enabled]); - write_string("defrag", thp_defrag_strings[settings->thp_defrag]); - write_string("shmem_enabled", - shmem_enabled_strings[settings->shmem_enabled]); - write_num("use_zero_page", settings->use_zero_page); - - write_num("khugepaged/defrag", khugepaged->defrag); - write_num("khugepaged/alloc_sleep_millisecs", - khugepaged->alloc_sleep_millisecs); - write_num("khugepaged/scan_sleep_millisecs", - khugepaged->scan_sleep_millisecs); - write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none); - write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); - write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); - write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); - - if (file_ops && finfo.type == VMA_FILE) - _write_num(finfo.dev_queue_read_ahead_path, - settings->read_ahead_kb); -} - -#define MAX_SETTINGS_DEPTH 4 -static struct settings settings_stack[MAX_SETTINGS_DEPTH]; -static int settings_index; - -static struct settings *current_settings(void) -{ - if (!settings_index) { - printf("Fail: No settings set"); - exit(EXIT_FAILURE); - } - return settings_stack + settings_index - 1; -} - -static void push_settings(struct settings *settings) -{ - if (settings_index >= MAX_SETTINGS_DEPTH) { - printf("Fail: Settings stack exceeded"); - exit(EXIT_FAILURE); - } - settings_stack[settings_index++] = *settings; - write_settings(current_settings()); -} - -static void pop_settings(void) -{ - if (settings_index <= 0) { - printf("Fail: Settings stack empty"); - exit(EXIT_FAILURE); - } - --settings_index; - write_settings(current_settings()); -} - -static void restore_settings(int sig) -{ - if (skip_settings_restore) - goto out; - - printf("Restore THP and khugepaged settings..."); - write_settings(&saved_settings); - success("OK"); - if (sig) - exit(EXIT_FAILURE); -out: - exit(exit_status); -} - -static void save_settings(void) -{ - printf("Save THP and khugepaged settings..."); - saved_settings = (struct settings) { - .thp_enabled = read_string("enabled", thp_enabled_strings), - .thp_defrag = read_string("defrag", thp_defrag_strings), - .shmem_enabled = - read_string("shmem_enabled", shmem_enabled_strings), - .use_zero_page = read_num("use_zero_page"), - }; - saved_settings.khugepaged = (struct khugepaged_settings) { - .defrag = read_num("khugepaged/defrag"), - .alloc_sleep_millisecs = - read_num("khugepaged/alloc_sleep_millisecs"), - .scan_sleep_millisecs = - read_num("khugepaged/scan_sleep_millisecs"), - .max_ptes_none = read_num("khugepaged/max_ptes_none"), - .max_ptes_swap = read_num("khugepaged/max_ptes_swap"), - .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), - .pages_to_scan = read_num("khugepaged/pages_to_scan"), - }; - if (file_ops && finfo.type == VMA_FILE) - saved_settings.read_ahead_kb = - _read_num(finfo.dev_queue_read_ahead_path); - - success("OK"); - - signal(SIGTERM, restore_settings); - signal(SIGINT, restore_settings); - signal(SIGHUP, restore_settings); - signal(SIGQUIT, restore_settings); -} - -static void get_finfo(const char *dir) -{ - struct stat path_stat; - struct statfs fs; - char buf[1 << 10]; - char path[PATH_MAX]; - char *str, *end; - - finfo.dir = dir; - stat(finfo.dir, &path_stat); - if (!S_ISDIR(path_stat.st_mode)) { - printf("%s: Not a directory (%s)\n", __func__, finfo.dir); - exit(EXIT_FAILURE); - } - if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, - finfo.dir) >= sizeof(finfo.path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - 
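/* - * Classify the backing filesystem: a tmpfs mount is handled via the - * shmem helpers; anything else is treated as a regular file VMA. - */ -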
if (statfs(finfo.dir, &fs)) { - perror("statfs()"); - exit(EXIT_FAILURE); - } - finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; - if (finfo.type == VMA_SHMEM) - return; - - /* Find owning device's queue/read_ahead_kb control */ - if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", - major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - if (read_file(path, buf, sizeof(buf)) < 0) { - perror("read_file(uevent)"); - exit(EXIT_FAILURE); - } - if (strstr(buf, "DEVTYPE=disk")) { - /* Found it */ - if (snprintf(finfo.dev_queue_read_ahead_path, - sizeof(finfo.dev_queue_read_ahead_path), - "/sys/dev/block/%d:%d/queue/read_ahead_kb", - major(path_stat.st_dev), minor(path_stat.st_dev)) - >= sizeof(finfo.dev_queue_read_ahead_path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return; - } - if (!strstr(buf, "DEVTYPE=partition")) { - printf("%s: Unknown device type: %s\n", __func__, path); - exit(EXIT_FAILURE); - } - /* - * Partition of block device - need to find actual device. - * Using naming convention that devnameN is partition of - * device devname. - */ - str = strstr(buf, "DEVNAME="); - if (!str) { - printf("%s: Could not read: %s\n", __func__, path); - exit(EXIT_FAILURE); - } - str += 8; - end = str; - while (*end) { - if (isdigit(*end)) { - *end = '\0'; - if (snprintf(finfo.dev_queue_read_ahead_path, - sizeof(finfo.dev_queue_read_ahead_path), - "/sys/block/%s/queue/read_ahead_kb", - str) >= sizeof(finfo.dev_queue_read_ahead_path)) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - return; - } - ++end; - } - printf("%s: Could not read: %s\n", __func__, path); - exit(EXIT_FAILURE); -} - -static bool check_swap(void *addr, unsigned long size) -{ - bool swap = false; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - - fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); - exit(EXIT_FAILURE); - } - if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) - goto err_out; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", - size >> 10); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - /* - * Fetch the Swap: in the same block and check whether it got - * the expected number of hugepages next. - */ - if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) - goto err_out; - - if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) - goto err_out; - - swap = true; -err_out: - fclose(fp); - return swap; -} - -static void *alloc_mapping(int nr) -{ - void *p; - - p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (p != BASE_ADDR) { - printf("Failed to allocate VMA at %p\n", BASE_ADDR); - exit(EXIT_FAILURE); - } - - return p; -} - -static void fill_memory(int *p, unsigned long start, unsigned long end) -{ - int i; - - for (i = start / page_size; i < end / page_size; i++) - p[i * page_size / sizeof(*p)] = i + 0xdead0000; -} - -/* - * MADV_COLLAPSE is a best-effort request and may fail if an internal - * resource is temporarily unavailable, in which case it will set errno to - * EAGAIN. 
In such a case, immediately reattempt the operation one more - * time. - */ -static int madvise_collapse_retry(void *p, unsigned long size) -{ - bool retry = true; - int ret; - -retry: - ret = madvise(p, size, MADV_COLLAPSE); - if (ret && errno == EAGAIN && retry) { - retry = false; - goto retry; - } - return ret; -} - -/* - * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with - * validate_memory()'able contents. - */ -static void *alloc_hpage(struct mem_ops *ops) -{ - void *p = ops->setup_area(1); - - ops->fault(p, 0, hpage_pmd_size); - - /* - * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. - * The latter is ineligible for collapse by MADV_COLLAPSE - * while the former might cause MADV_COLLAPSE to race with - * khugepaged on low-load system (like a test machine), which - * would cause MADV_COLLAPSE to fail with EAGAIN. - */ - printf("Allocate huge page..."); - if (madvise_collapse_retry(p, hpage_pmd_size)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (!ops->check_huge(p, 1)) { - perror("madvise(MADV_COLLAPSE)"); - exit(EXIT_FAILURE); - } - if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { - perror("madvise(MADV_HUGEPAGE)"); - exit(EXIT_FAILURE); - } - success("OK"); - return p; -} - -static void validate_memory(int *p, unsigned long start, unsigned long end) -{ - int i; - - for (i = start / page_size; i < end / page_size; i++) { - if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) { - printf("Page %d is corrupted: %#x\n", - i, p[i * page_size / sizeof(*p)]); - exit(EXIT_FAILURE); - } - } -} - -static void *anon_setup_area(int nr_hpages) -{ - return alloc_mapping(nr_hpages); -} - -static void anon_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); -} - -static void anon_fault(void *p, unsigned long start, unsigned long end) -{ - fill_memory(p, start, end); -} - -static bool anon_check_huge(void *addr, int nr_hpages) -{ - return check_huge_anon(addr, nr_hpages, hpage_pmd_size); -} - -static void *file_setup_area(int nr_hpages) -{ - int fd; - void *p; - unsigned long size; - - unlink(finfo.path); /* Cleanup from previous failed tests */ - printf("Creating %s for collapse%s...", finfo.path, - finfo.type == VMA_SHMEM ? 
" (tmpfs)" : ""); - fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, - 777); - if (fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } - - size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(nr_hpages); - fill_memory(p, 0, size); - write(fd, p, size); - close(fd); - munmap(p, size); - success("OK"); - - printf("Opening %s read only for collapse...", finfo.path); - finfo.fd = open(finfo.path, O_RDONLY, 777); - if (finfo.fd < 0) { - perror("open()"); - exit(EXIT_FAILURE); - } - p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, - MAP_PRIVATE, finfo.fd, 0); - if (p == MAP_FAILED || p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } - - /* Drop page cache */ - write_file("/proc/sys/vm/drop_caches", "3", 2); - success("OK"); - return p; -} - -static void file_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); - close(finfo.fd); - unlink(finfo.path); -} - -static void file_fault(void *p, unsigned long start, unsigned long end) -{ - if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { - perror("madvise(MADV_POPULATE_READ"); - exit(EXIT_FAILURE); - } -} - -static bool file_check_huge(void *addr, int nr_hpages) -{ - switch (finfo.type) { - case VMA_FILE: - return check_huge_file(addr, nr_hpages, hpage_pmd_size); - case VMA_SHMEM: - return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); - default: - exit(EXIT_FAILURE); - return false; - } -} - -static void *shmem_setup_area(int nr_hpages) -{ - void *p; - unsigned long size = nr_hpages * hpage_pmd_size; - - finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); - if (finfo.fd < 0) { - perror("memfd_create()"); - exit(EXIT_FAILURE); - } - if (ftruncate(finfo.fd, size)) { - perror("ftruncate()"); - exit(EXIT_FAILURE); - } - p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, - 0); - if (p != BASE_ADDR) { - perror("mmap()"); - exit(EXIT_FAILURE); - } - return p; -} - -static void shmem_cleanup_area(void *p, unsigned long size) -{ - munmap(p, size); - close(finfo.fd); -} - -static bool shmem_check_huge(void *addr, int nr_hpages) -{ - return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); -} - -static struct mem_ops __anon_ops = { - .setup_area = &anon_setup_area, - .cleanup_area = &anon_cleanup_area, - .fault = &anon_fault, - .check_huge = &anon_check_huge, - .name = "anon", -}; - -static struct mem_ops __file_ops = { - .setup_area = &file_setup_area, - .cleanup_area = &file_cleanup_area, - .fault = &file_fault, - .check_huge = &file_check_huge, - .name = "file", -}; - -static struct mem_ops __shmem_ops = { - .setup_area = &shmem_setup_area, - .cleanup_area = &shmem_cleanup_area, - .fault = &anon_fault, - .check_huge = &shmem_check_huge, - .name = "shmem", -}; - -static void __madvise_collapse(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect) -{ - int ret; - struct settings settings = *current_settings(); - - printf("%s...", msg); - - /* - * Prevent khugepaged interference and tests that MADV_COLLAPSE - * ignores /sys/kernel/mm/transparent_hugepage/enabled - */ - settings.thp_enabled = THP_NEVER; - settings.shmem_enabled = SHMEM_NEVER; - push_settings(&settings); - - /* Clear VM_NOHUGEPAGE */ - madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); - ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); - if (((bool)ret) == expect) - fail("Fail: Bad return value"); - else if (!ops->check_huge(p, expect ? 
nr_hpages : 0)) - fail("Fail: check_huge()"); - else - success("OK"); - - pop_settings(); -} - -static void madvise_collapse(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect) -{ - /* Sanity check */ - if (!ops->check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } - __madvise_collapse(msg, p, nr_hpages, ops, expect); -} - -#define TICK 500000 -static bool wait_for_scan(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops) -{ - int full_scans; - int timeout = 6; /* 3 seconds */ - - /* Sanity check */ - if (!ops->check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } - - madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); - - /* Wait until the second full_scan completed */ - full_scans = read_num("khugepaged/full_scans") + 2; - - printf("%s...", msg); - while (timeout--) { - if (ops->check_huge(p, nr_hpages)) - break; - if (read_num("khugepaged/full_scans") >= full_scans) - break; - printf("."); - usleep(TICK); - } - - madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); - - return timeout == -1; -} - -static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, - struct mem_ops *ops, bool expect) -{ - if (wait_for_scan(msg, p, nr_hpages, ops)) { - if (expect) - fail("Timeout"); - else - success("OK"); - return; - } - - /* - * For file and shmem memory, khugepaged only retracts pte entries after - * putting the new hugepage in the page cache. The hugepage must be - * subsequently refaulted to install the pmd mapping for the mm. - */ - if (ops != &__anon_ops) - ops->fault(p, 0, nr_hpages * hpage_pmd_size); - - if (ops->check_huge(p, expect ? nr_hpages : 0)) - success("OK"); - else - fail("Fail"); -} - -static struct collapse_context __khugepaged_context = { - .collapse = &khugepaged_collapse, - .enforce_pte_scan_limits = true, - .name = "khugepaged", -}; - -static struct collapse_context __madvise_context = { - .collapse = &madvise_collapse, - .enforce_pte_scan_limits = false, - .name = "madvise", -}; - -static bool is_tmpfs(struct mem_ops *ops) -{ - return ops == &__file_ops && finfo.type == VMA_SHMEM; -} - -static void alloc_at_fault(void) -{ - struct settings settings = *current_settings(); - char *p; - - settings.thp_enabled = THP_ALWAYS; - push_settings(&settings); - - p = alloc_mapping(1); - *p = 1; - printf("Allocate huge page on fault..."); - if (check_huge_anon(p, 1, hpage_pmd_size)) - success("OK"); - else - fail("Fail"); - - pop_settings(); - - madvise(p, page_size, MADV_DONTNEED); - printf("Split huge PMD on MADV_DONTNEED..."); - if (check_huge_anon(p, 0, hpage_pmd_size)) - success("OK"); - else - fail("Fail"); - munmap(p, hpage_pmd_size); -} - -static void collapse_full(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - int nr_hpages = 4; - unsigned long size = nr_hpages * hpage_pmd_size; - - p = ops->setup_area(nr_hpages); - ops->fault(p, 0, size); - c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, - ops, true); - validate_memory(p, 0, size); - ops->cleanup_area(p, size); -} - -static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = ops->setup_area(1); - c->collapse("Do not collapse empty PTE table", p, 1, ops, false); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = ops->setup_area(1); - ops->fault(p, 0, page_size); - c->collapse("Collapse PTE table with single PTE entry present", p, - 
1, ops, true); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) -{ - int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = *current_settings(); - void *p; - - settings.khugepaged.max_ptes_none = max_ptes_none; - push_settings(&settings); - - p = ops->setup_area(1); - - if (is_tmpfs(ops)) { - /* shmem pages always in the page cache */ - printf("tmpfs..."); - skip("Skip"); - goto skip; - } - - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, - ops, !c->enforce_pte_scan_limits); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - - if (c->enforce_pte_scan_limits) { - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, - true); - validate_memory(p, 0, - (hpage_pmd_nr - max_ptes_none) * page_size); - } -skip: - ops->cleanup_area(p, hpage_pmd_size); - pop_settings(); -} - -static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = ops->setup_area(1); - ops->fault(p, 0, hpage_pmd_size); - - printf("Swapout one page..."); - if (madvise(p, page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } - - c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, - true); - validate_memory(p, 0, hpage_pmd_size); -out: - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) -{ - int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); - void *p; - - p = ops->setup_area(1); - ops->fault(p, 0, hpage_pmd_size); - - printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); - if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, (max_ptes_swap + 1) * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } - - c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, - !c->enforce_pte_scan_limits); - validate_memory(p, 0, hpage_pmd_size); - - if (c->enforce_pte_scan_limits) { - ops->fault(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, - hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, max_ptes_swap * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } - - c->collapse("Collapse with max_ptes_swap pages swapped out", p, - 1, ops, true); - validate_memory(p, 0, hpage_pmd_size); - } -out: - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = alloc_hpage(ops); - - if (is_tmpfs(ops)) { - /* MADV_DONTNEED won't evict tmpfs pages */ - printf("tmpfs..."); - skip("Skip"); - goto skip; - } - - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - printf("Split huge page leaving single PTE mapping compound page..."); - madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - c->collapse("Collapse PTE table with single PTE mapping compound page", - p, 1, ops, true); - validate_memory(p, 0, page_size); -skip: - 
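/* Common cleanup for both the tested and the skipped (tmpfs) paths. */ -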
ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - - p = alloc_hpage(ops); - printf("Split huge page leaving single PTE page table full of compound pages..."); - madvise(p, page_size, MADV_NOHUGEPAGE); - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - c->collapse("Collapse PTE table full of compound pages", p, 1, ops, - true); - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) -{ - void *p; - int i; - - p = ops->setup_area(1); - for (i = 0; i < hpage_pmd_nr; i++) { - printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", - i + 1, hpage_pmd_nr); - - madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); - ops->fault(BASE_ADDR, 0, hpage_pmd_size); - if (!ops->check_huge(BASE_ADDR, 1)) { - printf("Failed to allocate huge page\n"); - exit(EXIT_FAILURE); - } - madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE); - - p = mremap(BASE_ADDR - i * page_size, - i * page_size + hpage_pmd_size, - (i + 1) * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, - BASE_ADDR + 2 * hpage_pmd_size); - if (p == MAP_FAILED) { - perror("mremap+unmap"); - exit(EXIT_FAILURE); - } - - p = mremap(BASE_ADDR + 2 * hpage_pmd_size, - (i + 1) * page_size, - (i + 1) * page_size + hpage_pmd_size, - MREMAP_MAYMOVE | MREMAP_FIXED, - BASE_ADDR - (i + 1) * page_size); - if (p == MAP_FAILED) { - perror("mremap+alloc"); - exit(EXIT_FAILURE); - } - } - - ops->cleanup_area(BASE_ADDR, hpage_pmd_size); - ops->fault(p, 0, hpage_pmd_size); - if (!ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - - c->collapse("Collapse PTE table full of different compound pages", p, 1, - ops, true); - - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) -{ - int wstatus; - void *p; - - p = ops->setup_area(1); - - printf("Allocate small page..."); - ops->fault(p, 0, page_size); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - printf("Share small page over fork()..."); - if (!fork()) { - /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - ops->fault(p, page_size, 2 * page_size); - c->collapse("Collapse PTE table with single page shared with parent process", - p, 1, ops, true); - - validate_memory(p, 0, page_size); - ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); - } - - wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); - - printf("Check if parent still has small page..."); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, page_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) -{ - int wstatus; - void *p; - - p = alloc_hpage(ops); - printf("Share huge page over fork()..."); - if (!fork()) { - /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - - printf("Split huge page PMD in child process..."); - madvise(p, page_size, MADV_NOHUGEPAGE); - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (ops->check_huge(p, 0)) - success("OK"); - 
else - fail("Fail"); - ops->fault(p, 0, page_size); - - write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); - c->collapse("Collapse PTE table full of compound pages in child", - p, 1, ops, true); - write_num("khugepaged/max_ptes_shared", - current_settings()->khugepaged.max_ptes_shared); - - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); - } - - wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); - - printf("Check if parent still has huge page..."); - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) -{ - int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); - int wstatus; - void *p; - - p = alloc_hpage(ops); - printf("Share huge page over fork()..."); - if (!fork()) { - /* Do not touch settings on child exit */ - skip_settings_restore = true; - exit_status = 0; - - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - c->collapse("Maybe collapse with max_ptes_shared exceeded", p, - 1, ops, !c->enforce_pte_scan_limits); - - if (c->enforce_pte_scan_limits) { - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * - page_size); - if (ops->check_huge(p, 0)) - success("OK"); - else - fail("Fail"); - - c->collapse("Collapse with max_ptes_shared PTEs shared", - p, 1, ops, true); - } - - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); - exit(exit_status); - } - - wait(&wstatus); - exit_status += WEXITSTATUS(wstatus); - - printf("Check if parent still has huge page..."); - if (ops->check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -static void madvise_collapse_existing_thps(struct collapse_context *c, - struct mem_ops *ops) -{ - void *p; - - p = ops->setup_area(1); - ops->fault(p, 0, hpage_pmd_size); - c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); - validate_memory(p, 0, hpage_pmd_size); - - /* c->collapse() will find a hugepage and complain - call directly. */ - __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); - validate_memory(p, 0, hpage_pmd_size); - ops->cleanup_area(p, hpage_pmd_size); -} - -/* - * Test race with khugepaged where page tables have been retracted and - * pmd cleared. 
- */ -static void madvise_retracted_page_tables(struct collapse_context *c, - struct mem_ops *ops) -{ - void *p; - int nr_hpages = 1; - unsigned long size = nr_hpages * hpage_pmd_size; - - p = ops->setup_area(nr_hpages); - ops->fault(p, 0, size); - - /* Let khugepaged collapse and leave pmd cleared */ - if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, - ops)) { - fail("Timeout"); - return; - } - success("OK"); - c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, - true); - validate_memory(p, 0, size); - ops->cleanup_area(p, size); -} - -static void usage(void) -{ - fprintf(stderr, "\nUsage: ./khugepaged <test type> [dir]\n\n"); - fprintf(stderr, "\t<test type>\t: <context>:<mem_type>\n"); - fprintf(stderr, "\t<context>\t: [all|khugepaged|madvise]\n"); - fprintf(stderr, "\t<mem_type>\t: [all|anon|file|shmem]\n"); - fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); - fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); - fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); - fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); - fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); - exit(1); -} - -static void parse_test_type(int argc, const char **argv) -{ - char *buf; - const char *token; - - if (argc == 1) { - /* Backwards compatibility */ - khugepaged_context = &__khugepaged_context; - madvise_context = &__madvise_context; - anon_ops = &__anon_ops; - return; - } - - buf = strdup(argv[1]); - token = strsep(&buf, ":"); - - if (!strcmp(token, "all")) { - khugepaged_context = &__khugepaged_context; - madvise_context = &__madvise_context; - } else if (!strcmp(token, "khugepaged")) { - khugepaged_context = &__khugepaged_context; - } else if (!strcmp(token, "madvise")) { - madvise_context = &__madvise_context; - } else { - usage(); - } - - if (!buf) - usage(); - - if (!strcmp(buf, "all")) { - file_ops = &__file_ops; - anon_ops = &__anon_ops; - shmem_ops = &__shmem_ops; - } else if (!strcmp(buf, "anon")) { - anon_ops = &__anon_ops; - } else if (!strcmp(buf, "file")) { - file_ops = &__file_ops; - } else if (!strcmp(buf, "shmem")) { - shmem_ops = &__shmem_ops; - } else { - usage(); - } - - if (!file_ops) - return; - - if (argc != 3) - usage(); -} - -int main(int argc, const char **argv) -{ - struct settings default_settings = { - .thp_enabled = THP_MADVISE, - .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_ADVISE, - .use_zero_page = 0, - .khugepaged = { - .defrag = 1, - .alloc_sleep_millisecs = 10, - .scan_sleep_millisecs = 10, - }, - /* - * When testing file-backed memory, the collapse path - * looks at how many pages are found in the page cache, not - * what pages are mapped. Disable read ahead optimization so - * pages don't find their way into the page cache unless - * we mem_ops->fault() them in. 
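- * With read_ahead_kb set to 0, only pages the tests explicitly fault in - * end up resident in the page cache.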
- */ - .read_ahead_kb = 0, - }; - - parse_test_type(argc, argv); - - if (file_ops) - get_finfo(argv[2]); - - setbuf(stdout, NULL); - - page_size = getpagesize(); - hpage_pmd_size = read_pmd_pagesize(); - hpage_pmd_nr = hpage_pmd_size / page_size; - - default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; - default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8; - default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2; - default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; - - save_settings(); - push_settings(&default_settings); - - alloc_at_fault(); - -#define TEST(t, c, o) do { \ - if (c && o) { \ - printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ - t(c, o); \ - } \ - } while (0) - - TEST(collapse_full, khugepaged_context, anon_ops); - TEST(collapse_full, khugepaged_context, file_ops); - TEST(collapse_full, khugepaged_context, shmem_ops); - TEST(collapse_full, madvise_context, anon_ops); - TEST(collapse_full, madvise_context, file_ops); - TEST(collapse_full, madvise_context, shmem_ops); - - TEST(collapse_empty, khugepaged_context, anon_ops); - TEST(collapse_empty, madvise_context, anon_ops); - - TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); - TEST(collapse_single_pte_entry, khugepaged_context, file_ops); - TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); - TEST(collapse_single_pte_entry, madvise_context, anon_ops); - TEST(collapse_single_pte_entry, madvise_context, file_ops); - TEST(collapse_single_pte_entry, madvise_context, shmem_ops); - - TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); - TEST(collapse_max_ptes_none, khugepaged_context, file_ops); - TEST(collapse_max_ptes_none, madvise_context, anon_ops); - TEST(collapse_max_ptes_none, madvise_context, file_ops); - - TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); - TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); - TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); - TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); - - TEST(collapse_full_of_compound, khugepaged_context, anon_ops); - TEST(collapse_full_of_compound, khugepaged_context, file_ops); - TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); - TEST(collapse_full_of_compound, madvise_context, anon_ops); - TEST(collapse_full_of_compound, madvise_context, file_ops); - TEST(collapse_full_of_compound, madvise_context, shmem_ops); - - TEST(collapse_compound_extreme, khugepaged_context, anon_ops); - TEST(collapse_compound_extreme, madvise_context, anon_ops); - - TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); - TEST(collapse_swapin_single_pte, madvise_context, anon_ops); - - TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); - TEST(collapse_max_ptes_swap, madvise_context, anon_ops); - - TEST(collapse_fork, khugepaged_context, anon_ops); - TEST(collapse_fork, madvise_context, anon_ops); - - TEST(collapse_fork_compound, khugepaged_context, anon_ops); - TEST(collapse_fork_compound, madvise_context, anon_ops); - - TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); - TEST(collapse_max_ptes_shared, madvise_context, anon_ops); - - TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); - TEST(madvise_collapse_existing_thps, madvise_context, file_ops); - TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); - - TEST(madvise_retracted_page_tables, madvise_context, file_ops); - TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); - - 
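/* - * Normal exit: restore_settings(0) writes the saved THP and khugepaged - * sysfs settings back and exits with the accumulated exit_status. - */ -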
restore_settings(0); -} --- a/tools/testing/selftests/vm/ksm_functional_tests.c +++ /dev/null @@ -1,374 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * KSM functional tests - * - * Copyright 2022, Red Hat, Inc. - * - * Author(s): David Hildenbrand <david@xxxxxxxxxx> - */ -#define _GNU_SOURCE -#include <stdlib.h> -#include <string.h> -#include <stdbool.h> -#include <stdint.h> -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <sys/mman.h> -#include <sys/syscall.h> -#include <sys/ioctl.h> -#include <linux/userfaultfd.h> - -#include "../kselftest.h" -#include "vm_util.h" - -#define KiB 1024u -#define MiB (1024 * KiB) - -static int ksm_fd; -static int ksm_full_scans_fd; -static int ksm_zero_pages_fd; -static int ksm_use_zero_pages_fd; -static int pagemap_fd; -static size_t pagesize; - -static bool range_maps_duplicates(char *addr, unsigned long size) -{ - unsigned long offs_a, offs_b, pfn_a, pfn_b; - - /* - * There is no easy way to check if there are KSM pages mapped into - * this range. We only check that the range does not map the same PFN - * twice by comparing each pair of mapped pages. - */ - for (offs_a = 0; offs_a < size; offs_a += pagesize) { - pfn_a = pagemap_get_pfn(pagemap_fd, addr + offs_a); - /* Page not present or PFN not exposed by the kernel. */ - if (pfn_a == -1ul || !pfn_a) - continue; - - for (offs_b = offs_a + pagesize; offs_b < size; - offs_b += pagesize) { - pfn_b = pagemap_get_pfn(pagemap_fd, addr + offs_b); - if (pfn_b == -1ul || !pfn_b) - continue; - if (pfn_a == pfn_b) - return true; - } - } - return false; -} - -static bool check_ksm_zero_pages_count(unsigned long zero_size) -{ - unsigned long pages_expected = zero_size / (4 * KiB); - char buf[20]; - ssize_t read_size; - unsigned long ksm_zero_pages; - - read_size = pread(ksm_zero_pages_fd, buf, sizeof(buf) - 1, 0); - if (read_size < 0) - return -errno; - buf[read_size] = 0; - ksm_zero_pages = strtol(buf, NULL, 10); - - return ksm_zero_pages == pages_expected; -} - -static long ksm_get_full_scans(void) -{ - char buf[10]; - ssize_t ret; - - ret = pread(ksm_full_scans_fd, buf, sizeof(buf) - 1, 0); - if (ret <= 0) - return -errno; - buf[ret] = 0; - - return strtol(buf, NULL, 10); -} - -static int wait_two_full_scans(void) -{ - long start_scans, end_scans; - - start_scans = ksm_get_full_scans(); - if (start_scans < 0) - return -errno; - do { - end_scans = ksm_get_full_scans(); - if (end_scans < 0) - return end_scans; - } while (end_scans < start_scans + 2); - - return 0; -} - -static inline int ksm_merge(void) -{ - /* Wait for two full scans such that any possible merging happened. */ - if (write(ksm_fd, "1", 1) != 1) - return -errno; - return wait_two_full_scans(); -} - -static inline int make_cow(char *map, char val, unsigned long size) -{ - - memset(map, val, size); - return wait_two_full_scans(); -} - -static int unmerge_zero_page(char *start, unsigned long size) -{ - int ret; - - ret = madvise(start, size, MADV_UNMERGEABLE); - if (ret) { - ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); - return ret; - } - - return wait_two_full_scans(); -} - -static char *mmap_and_merge_range(char val, unsigned long size) -{ - char *map; - - map = mmap(NULL, size, PROT_READ|PROT_WRITE, - MAP_PRIVATE|MAP_ANON, -1, 0); - if (map == MAP_FAILED) { - ksft_test_result_fail("mmap() failed\n"); - return MAP_FAILED; - } - - /* Don't use THP. Ignore if THP are not around on a kernel. 
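KSM merges only base pages, so an existing THP would first have to be split before its subpages could be merged.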
*/ - if (madvise(map, size, MADV_NOHUGEPAGE) && errno != EINVAL) { - ksft_test_result_fail("MADV_NOHUGEPAGE failed\n"); - goto unmap; - } - - /* Make sure each page contains the same values to merge them. */ - memset(map, val, size); - if (madvise(map, size, MADV_MERGEABLE)) { - ksft_test_result_fail("MADV_MERGEABLE failed\n"); - goto unmap; - } - - /* Run KSM to trigger merging and wait. */ - if (ksm_merge()) { - ksft_test_result_fail("Running KSM failed\n"); - goto unmap; - } - return map; -unmap: - munmap(map, size); - return MAP_FAILED; -} - -static void test_unmerge(void) -{ - const unsigned int size = 2 * MiB; - char *map; - - ksft_print_msg("[RUN] %s\n", __func__); - - map = mmap_and_merge_range(0xcf, size); - if (map == MAP_FAILED) - return; - - if (madvise(map, size, MADV_UNMERGEABLE)) { - ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); - goto unmap; - } - - ksft_test_result(!range_maps_duplicates(map, size), - "Pages were unmerged\n"); -unmap: - munmap(map, size); -} - -static void test_unmerge_zero_pages(void) -{ - const unsigned int size = 2 * MiB; - char *map; - - ksft_print_msg("[RUN] %s\n", __func__); - - /* Confirm the interfaces */ - ksm_zero_pages_fd = open("/sys/kernel/mm/ksm/zero_pages_sharing", O_RDONLY); - if (ksm_zero_pages_fd < 0) { - ksft_test_result_skip("open(\"/sys/kernel/mm/ksm/zero_pages_sharing\") failed\n"); - return; - } - ksm_use_zero_pages_fd = open("/sys/kernel/mm/ksm/use_zero_pages", O_RDWR); - if (ksm_use_zero_pages_fd < 0) { - ksft_test_result_skip("open \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); - return; - } - if (write(ksm_use_zero_pages_fd, "1", 1) != 1) { - ksft_test_result_skip("write \"/sys/kernel/mm/ksm/use_zero_pages\" failed\n"); - return; - } - - /* Mmap zero pages */ - map = mmap_and_merge_range(0x00, size); - if (map == MAP_FAILED) - return; - - /* Case 1: write to the KSM zero pages to trigger COW */ - if (make_cow(map, 0xcf, size / 2)) { - ksft_test_result_fail("COW failed\n"); - goto unmap; - } - ksft_test_result(check_ksm_zero_pages_count(size / 2), - "zero page count reacts to COW\n"); - - /* Case 2: Call madvise(xxx, MADV_UNMERGEABLE) */ - if (unmerge_zero_page(map + size / 2, size / 4)) { - ksft_test_result_fail("unmerge_zero_page failed\n"); - goto unmap; - } - ksft_test_result(check_ksm_zero_pages_count(size / 4), - "zero page count reacts to unmerge\n"); - - /* Check if KSM pages are really unmerged */ - ksft_test_result(!range_maps_duplicates(map + size / 2, size / 4), - "KSM zero pages were unmerged\n"); - -unmap: - munmap(map, size); -} - -static void test_unmerge_discarded(void) -{ - const unsigned int size = 2 * MiB; - char *map; - - ksft_print_msg("[RUN] %s\n", __func__); - - map = mmap_and_merge_range(0xcf, size); - if (map == MAP_FAILED) - return; - - /* Discard half of all mapped pages so we have pte_none() entries. 
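MADV_UNMERGEABLE must handle these holes instead of assuming every PTE maps a KSM page.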
*/ - if (madvise(map, size / 2, MADV_DONTNEED)) { - ksft_test_result_fail("MADV_DONTNEED failed\n"); - goto unmap; - } - - if (madvise(map, size, MADV_UNMERGEABLE)) { - ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); - goto unmap; - } - - ksft_test_result(!range_maps_duplicates(map, size), - "Pages were unmerged\n"); -unmap: - munmap(map, size); -} - -#ifdef __NR_userfaultfd -static void test_unmerge_uffd_wp(void) -{ - struct uffdio_writeprotect uffd_writeprotect; - struct uffdio_register uffdio_register; - const unsigned int size = 2 * MiB; - struct uffdio_api uffdio_api; - char *map; - int uffd; - - ksft_print_msg("[RUN] %s\n", __func__); - - map = mmap_and_merge_range(0xcf, size); - if (map == MAP_FAILED) - return; - - /* See if UFFD is around. */ - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); - if (uffd < 0) { - ksft_test_result_skip("__NR_userfaultfd failed\n"); - goto unmap; - } - - /* See if UFFD-WP is around. */ - uffdio_api.api = UFFD_API; - uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; - if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { - ksft_test_result_fail("UFFDIO_API failed\n"); - goto close_uffd; - } - if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { - ksft_test_result_skip("UFFD_FEATURE_PAGEFAULT_FLAG_WP not available\n"); - goto close_uffd; - } - - /* Register UFFD-WP, no need for an actual handler. */ - uffdio_register.range.start = (unsigned long) map; - uffdio_register.range.len = size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { - ksft_test_result_fail("UFFDIO_REGISTER_MODE_WP failed\n"); - goto close_uffd; - } - - /* Write-protect the range using UFFD-WP. */ - uffd_writeprotect.range.start = (unsigned long) map; - uffd_writeprotect.range.len = size; - uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; - if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { - ksft_test_result_fail("UFFDIO_WRITEPROTECT failed\n"); - goto close_uffd; - } - - if (madvise(map, size, MADV_UNMERGEABLE)) { - ksft_test_result_fail("MADV_UNMERGEABLE failed\n"); - goto close_uffd; - } - - ksft_test_result(!range_maps_duplicates(map, size), - "Pages were unmerged\n"); -close_uffd: - close(uffd); -unmap: - munmap(map, size); -} -#endif - -int main(int argc, char **argv) -{ - unsigned int tests = 2; - int err; - -#ifdef __NR_userfaultfd - tests++; -#endif - - ksft_print_header(); - ksft_set_plan(tests); - - pagesize = getpagesize(); - - ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); - if (ksm_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/run\") failed\n"); - ksm_full_scans_fd = open("/sys/kernel/mm/ksm/full_scans", O_RDONLY); - if (ksm_full_scans_fd < 0) - ksft_exit_skip("open(\"/sys/kernel/mm/ksm/full_scans\") failed\n"); - - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); - - test_unmerge(); - test_unmerge_zero_pages(); - test_unmerge_discarded(); -#ifdef __NR_userfaultfd - test_unmerge_uffd_wp(); -#endif - - err = ksft_get_fail_cnt(); - if (err) - ksft_exit_fail_msg("%d out of %d tests failed\n", - err, ksft_test_num()); - return ksft_exit_pass(); -} --- a/tools/testing/selftests/vm/ksm_tests.c +++ /dev/null @@ -1,849 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -#include <sys/mman.h> -#include <stdbool.h> -#include <time.h> -#include <string.h> -#include <numa.h> -#include <unistd.h> -#include <fcntl.h> -#include <stdint.h> -#include <err.h> - -#include "../kselftest.h" -#include 
<include/vdso/time64.h> -#include "util.h" - -#define KSM_SYSFS_PATH "/sys/kernel/mm/ksm/" -#define KSM_FP(s) (KSM_SYSFS_PATH s) -#define KSM_SCAN_LIMIT_SEC_DEFAULT 120 -#define KSM_PAGE_COUNT_DEFAULT 10l -#define KSM_PROT_STR_DEFAULT "rw" -#define KSM_USE_ZERO_PAGES_DEFAULT false -#define KSM_MERGE_ACROSS_NODES_DEFAULT true -#define MB (1ul << 20) - -struct ksm_sysfs { - unsigned long max_page_sharing; - unsigned long merge_across_nodes; - unsigned long pages_to_scan; - unsigned long run; - unsigned long sleep_millisecs; - unsigned long stable_node_chains_prune_millisecs; - unsigned long use_zero_pages; -}; - -enum ksm_test_name { - CHECK_KSM_MERGE, - CHECK_KSM_UNMERGE, - CHECK_KSM_ZERO_PAGE_MERGE, - CHECK_KSM_NUMA_MERGE, - KSM_MERGE_TIME, - KSM_MERGE_TIME_HUGE_PAGES, - KSM_UNMERGE_TIME, - KSM_COW_TIME -}; - -static int ksm_write_sysfs(const char *file_path, unsigned long val) -{ - FILE *f = fopen(file_path, "w"); - - if (!f) { - fprintf(stderr, "f %s\n", file_path); - perror("fopen"); - return 1; - } - if (fprintf(f, "%lu", val) < 0) { - perror("fprintf"); - fclose(f); - return 1; - } - fclose(f); - - return 0; -} - -static int ksm_read_sysfs(const char *file_path, unsigned long *val) -{ - FILE *f = fopen(file_path, "r"); - - if (!f) { - fprintf(stderr, "f %s\n", file_path); - perror("fopen"); - return 1; - } - if (fscanf(f, "%lu", val) != 1) { - perror("fscanf"); - fclose(f); - return 1; - } - fclose(f); - - return 0; -} - -static int str_to_prot(char *prot_str) -{ - int prot = 0; - - if ((strchr(prot_str, 'r')) != NULL) - prot |= PROT_READ; - if ((strchr(prot_str, 'w')) != NULL) - prot |= PROT_WRITE; - if ((strchr(prot_str, 'x')) != NULL) - prot |= PROT_EXEC; - - return prot; -} - -static void print_help(void) -{ - printf("usage: ksm_tests [-h] <test type> [-a prot] [-p page_count] [-l timeout]\n" - "[-z use_zero_pages] [-m merge_across_nodes] [-s size]\n"); - - printf("Supported <test type>:\n" - " -M (page merging)\n" - " -Z (zero pages merging)\n" - " -N (merging of pages in different NUMA nodes)\n" - " -U (page unmerging)\n" - " -P evaluate merging time and speed.\n" - " For this test, the size of duplicated memory area (in MiB)\n" - " must be provided using -s option\n" - " -H evaluate merging time and speed of area allocated mostly with huge pages\n" - " For this test, the size of duplicated memory area (in MiB)\n" - " must be provided using -s option\n" - " -D evaluate unmerging time and speed when disabling KSM.\n" - " For this test, the size of duplicated memory area (in MiB)\n" - " must be provided using -s option\n" - " -C evaluate the time required to break COW of merged pages.\n\n"); - - printf(" -a: specify the access protections of pages.\n" - " <prot> must be of the form [rwx].\n" - " Default: %s\n", KSM_PROT_STR_DEFAULT); - printf(" -p: specify the number of pages to test.\n" - " Default: %ld\n", KSM_PAGE_COUNT_DEFAULT); - printf(" -l: limit the maximum running time (in seconds) for a test.\n" - " Default: %d seconds\n", KSM_SCAN_LIMIT_SEC_DEFAULT); - printf(" -z: change use_zero_pages tunable\n" - " Default: %d\n", KSM_USE_ZERO_PAGES_DEFAULT); - printf(" -m: change merge_across_nodes tunable\n" - " Default: %d\n", KSM_MERGE_ACROSS_NODES_DEFAULT); - printf(" -s: the size of duplicated memory area (in MiB)\n"); - - exit(0); -} - -static void *allocate_memory(void *ptr, int prot, int mapping, char data, size_t map_size) -{ - void *map_ptr = mmap(ptr, map_size, PROT_WRITE, mapping, -1, 0); - - if (!map_ptr) { - perror("mmap"); - return NULL; - } - memset(map_ptr, data, 
map_size); - if (mprotect(map_ptr, map_size, prot)) { - perror("mprotect"); - munmap(map_ptr, map_size); - return NULL; - } - - return map_ptr; -} - -static int ksm_do_scan(int scan_count, struct timespec start_time, int timeout) -{ - struct timespec cur_time; - unsigned long cur_scan, init_scan; - - if (ksm_read_sysfs(KSM_FP("full_scans"), &init_scan)) - return 1; - cur_scan = init_scan; - - while (cur_scan < init_scan + scan_count) { - if (ksm_read_sysfs(KSM_FP("full_scans"), &cur_scan)) - return 1; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &cur_time)) { - perror("clock_gettime"); - return 1; - } - if ((cur_time.tv_sec - start_time.tv_sec) > timeout) { - printf("Scan time limit exceeded\n"); - return 1; - } - } - - return 0; -} - -static int ksm_merge_pages(void *addr, size_t size, struct timespec start_time, int timeout) -{ - if (madvise(addr, size, MADV_MERGEABLE)) { - perror("madvise"); - return 1; - } - if (ksm_write_sysfs(KSM_FP("run"), 1)) - return 1; - - /* Since merging occurs only after 2 scans, make sure to get at least 2 full scans */ - if (ksm_do_scan(2, start_time, timeout)) - return 1; - - return 0; -} - -static int ksm_unmerge_pages(void *addr, size_t size, - struct timespec start_time, int timeout) -{ - if (madvise(addr, size, MADV_UNMERGEABLE)) { - perror("madvise"); - return 1; - } - return 0; -} - -static bool assert_ksm_pages_count(long dupl_page_count) -{ - unsigned long max_page_sharing, pages_sharing, pages_shared; - - if (ksm_read_sysfs(KSM_FP("pages_shared"), &pages_shared) || - ksm_read_sysfs(KSM_FP("pages_sharing"), &pages_sharing) || - ksm_read_sysfs(KSM_FP("max_page_sharing"), &max_page_sharing)) - return false; - - /* - * Since there must be at least 2 pages for merging and 1 page can be - * shared with the limited number of pages (max_page_sharing), sometimes - * there are 'leftover' pages that cannot be merged. For example, if there - * are 11 pages and max_page_sharing = 10, then only 10 pages will be - * merged and the 11th page won't be affected. As a result, when the number - * of duplicate pages is divided by max_page_sharing and the remainder is 1, - * pages_shared and pages_sharing values will be equal between dupl_page_count - * and dupl_page_count - 1. - */ - if (dupl_page_count % max_page_sharing == 1 || dupl_page_count % max_page_sharing == 0) { - if (pages_shared == dupl_page_count / max_page_sharing && - pages_sharing == pages_shared * (max_page_sharing - 1)) - return true; - } else { - if (pages_shared == (dupl_page_count / max_page_sharing + 1) && - pages_sharing == dupl_page_count - pages_shared) - return true; - } - - return false; -} - -static int ksm_save_def(struct ksm_sysfs *ksm_sysfs) -{ - if (ksm_read_sysfs(KSM_FP("max_page_sharing"), &ksm_sysfs->max_page_sharing) || - numa_available() ? 0 : - ksm_read_sysfs(KSM_FP("merge_across_nodes"), &ksm_sysfs->merge_across_nodes) || - ksm_read_sysfs(KSM_FP("sleep_millisecs"), &ksm_sysfs->sleep_millisecs) || - ksm_read_sysfs(KSM_FP("pages_to_scan"), &ksm_sysfs->pages_to_scan) || - ksm_read_sysfs(KSM_FP("run"), &ksm_sysfs->run) || - ksm_read_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), - &ksm_sysfs->stable_node_chains_prune_millisecs) || - ksm_read_sysfs(KSM_FP("use_zero_pages"), &ksm_sysfs->use_zero_pages)) - return 1; - - return 0; -} - -static int ksm_restore(struct ksm_sysfs *ksm_sysfs) -{ - if (ksm_write_sysfs(KSM_FP("max_page_sharing"), ksm_sysfs->max_page_sharing) || - numa_available() ? 
0 : - ksm_write_sysfs(KSM_FP("merge_across_nodes"), ksm_sysfs->merge_across_nodes) || - ksm_write_sysfs(KSM_FP("pages_to_scan"), ksm_sysfs->pages_to_scan) || - ksm_write_sysfs(KSM_FP("run"), ksm_sysfs->run) || - ksm_write_sysfs(KSM_FP("sleep_millisecs"), ksm_sysfs->sleep_millisecs) || - ksm_write_sysfs(KSM_FP("stable_node_chains_prune_millisecs"), - ksm_sysfs->stable_node_chains_prune_millisecs) || - ksm_write_sysfs(KSM_FP("use_zero_pages"), ksm_sysfs->use_zero_pages)) - return 1; - - return 0; -} - -static int check_ksm_merge(int mapping, int prot, long page_count, int timeout, size_t page_size) -{ - void *map_ptr; - struct timespec start_time; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - /* fill pages with the same data and merge them */ - map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); - if (!map_ptr) - return KSFT_FAIL; - - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) - goto err_out; - - /* verify that the right number of pages are merged */ - if (assert_ksm_pages_count(page_count)) { - printf("OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_PASS; - } - -err_out: - printf("Not OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_FAIL; -} - -static int check_ksm_unmerge(int mapping, int prot, int timeout, size_t page_size) -{ - void *map_ptr; - struct timespec start_time; - int page_count = 2; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - /* fill pages with the same data and merge them */ - map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); - if (!map_ptr) - return KSFT_FAIL; - - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) - goto err_out; - - /* change 1 byte in each of the 2 pages -- KSM must automatically unmerge them */ - memset(map_ptr, '-', 1); - memset(map_ptr + page_size, '+', 1); - - /* get at least 1 scan, so KSM can detect that the pages were modified */ - if (ksm_do_scan(1, start_time, timeout)) - goto err_out; - - /* check that unmerging was successful and 0 pages are currently merged */ - if (assert_ksm_pages_count(0)) { - printf("OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_PASS; - } - -err_out: - printf("Not OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_FAIL; -} - -static int check_ksm_zero_page_merge(int mapping, int prot, long page_count, int timeout, - bool use_zero_pages, size_t page_size) -{ - void *map_ptr; - struct timespec start_time; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - if (ksm_write_sysfs(KSM_FP("use_zero_pages"), use_zero_pages)) - return KSFT_FAIL; - - /* fill pages with zero and try to merge them */ - map_ptr = allocate_memory(NULL, prot, mapping, 0, page_size * page_count); - if (!map_ptr) - return KSFT_FAIL; - - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) - goto err_out; - - /* - * verify that the right number of pages are merged: - * 1) if use_zero_pages is set to 1, empty pages are merged - * with the kernel zero page instead of with each other; - * 2) if use_zero_pages is set to 0, empty pages are not treated specially - * and merged as usual. 
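For concreteness, a worked example of the accounting that assert_ksm_pages_count() performs for the check below (editor's note, not part of the patch; assumes the default max_page_sharing of 256 and the default page_count of 10):

	use_zero_pages == 1: all ten empty pages map the kernel zero page,
	  so pages_shared == 0 and pages_sharing == 0, and the test expects
	  assert_ksm_pages_count(0) to hold;
	use_zero_pages == 0: KSM keeps one stable page plus nine duplicates,
	  so pages_shared == 1 and pages_sharing == 9, matching
	  assert_ksm_pages_count(page_count).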
- */ - if (use_zero_pages && !assert_ksm_pages_count(0)) - goto err_out; - else if (!use_zero_pages && !assert_ksm_pages_count(page_count)) - goto err_out; - - printf("OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_FAIL; -} - -static int get_next_mem_node(int node) -{ - - long node_size; - int mem_node = 0; - int i, max_node = numa_max_node(); - - for (i = node + 1; i <= max_node + node; i++) { - mem_node = i % (max_node + 1); - node_size = numa_node_size(mem_node, NULL); - if (node_size > 0) - break; - } - return mem_node; -} - -static int get_first_mem_node(void) -{ - return get_next_mem_node(numa_max_node()); -} - -static int check_ksm_numa_merge(int mapping, int prot, int timeout, bool merge_across_nodes, - size_t page_size) -{ - void *numa1_map_ptr, *numa2_map_ptr; - struct timespec start_time; - int page_count = 2; - int first_node; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - if (numa_available() < 0) { - perror("NUMA support not enabled"); - return KSFT_SKIP; - } - if (numa_num_configured_nodes() <= 1) { - printf("At least 2 NUMA nodes must be available\n"); - return KSFT_SKIP; - } - if (ksm_write_sysfs(KSM_FP("merge_across_nodes"), merge_across_nodes)) - return KSFT_FAIL; - - /* allocate 2 pages in 2 different NUMA nodes and fill them with the same data */ - first_node = get_first_mem_node(); - numa1_map_ptr = numa_alloc_onnode(page_size, first_node); - numa2_map_ptr = numa_alloc_onnode(page_size, get_next_mem_node(first_node)); - if (!numa1_map_ptr || !numa2_map_ptr) { - perror("numa_alloc_onnode"); - return KSFT_FAIL; - } - - memset(numa1_map_ptr, '*', page_size); - memset(numa2_map_ptr, '*', page_size); - - /* try to merge the pages */ - if (ksm_merge_pages(numa1_map_ptr, page_size, start_time, timeout) || - ksm_merge_pages(numa2_map_ptr, page_size, start_time, timeout)) - goto err_out; - - /* - * verify that the right number of pages are merged: - * 1) if merge_across_nodes was enabled, 2 duplicate pages will be merged; - * 2) if merge_across_nodes = 0, there must be 0 merged pages, since there is - * only 1 unique page in each node and they can't be shared. 
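As an aside, page placement can be verified independently of the KSM counters: move_pages() with a NULL nodes array only queries the node backing each page. A minimal sketch (editor's illustration, not part of the patch; link with -lnuma):

#include <numaif.h>

/* Return the NUMA node currently backing @page, or -1 on error. */
static int page_node(void *page)
{
	int status = -1;

	/* A NULL nodes array turns move_pages() into a pure query. */
	if (move_pages(0 /* self */, 1, &page, NULL, &status, 0) < 0)
		return -1;
	return status;	/* node id; negative means not mapped/error */
}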
- */ - if (merge_across_nodes && !assert_ksm_pages_count(page_count)) - goto err_out; - else if (!merge_across_nodes && !assert_ksm_pages_count(0)) - goto err_out; - - numa_free(numa1_map_ptr, page_size); - numa_free(numa2_map_ptr, page_size); - printf("OK\n"); - return KSFT_PASS; - -err_out: - numa_free(numa1_map_ptr, page_size); - numa_free(numa2_map_ptr, page_size); - printf("Not OK\n"); - return KSFT_FAIL; -} - -static int ksm_merge_hugepages_time(int mapping, int prot, int timeout, size_t map_size) -{ - void *map_ptr, *map_ptr_orig; - struct timespec start_time, end_time; - unsigned long scan_time_ns; - int pagemap_fd, n_normal_pages, n_huge_pages; - - map_size *= MB; - size_t len = map_size; - - len -= len % HPAGE_SIZE; - map_ptr_orig = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE, -1, 0); - map_ptr = map_ptr_orig + HPAGE_SIZE - (uintptr_t)map_ptr_orig % HPAGE_SIZE; - - if (map_ptr_orig == MAP_FAILED) - err(2, "initial mmap"); - - if (madvise(map_ptr, len + HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - err(2, "open pagemap"); - - n_normal_pages = 0; - n_huge_pages = 0; - for (void *p = map_ptr; p < map_ptr + len; p += HPAGE_SIZE) { - if (allocate_transhuge(p, pagemap_fd) < 0) - n_normal_pages++; - else - n_huge_pages++; - } - printf("Number of normal pages: %d\n", n_normal_pages); - printf("Number of huge pages: %d\n", n_huge_pages); - - memset(map_ptr, '*', len); - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) - goto err_out; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - goto err_out; - } - - scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, - scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / - ((double)scan_time_ns / NSEC_PER_SEC)); - - munmap(map_ptr_orig, len + HPAGE_SIZE); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr_orig, len + HPAGE_SIZE); - return KSFT_FAIL; -} - -static int ksm_merge_time(int mapping, int prot, int timeout, size_t map_size) -{ - void *map_ptr; - struct timespec start_time, end_time; - unsigned long scan_time_ns; - - map_size *= MB; - - map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); - if (!map_ptr) - return KSFT_FAIL; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) - goto err_out; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - goto err_out; - } - - scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, - scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / - ((double)scan_time_ns / NSEC_PER_SEC)); - - munmap(map_ptr, map_size); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr, map_size); - return KSFT_FAIL; -} - -static int ksm_unmerge_time(int mapping, int prot, int timeout, size_t map_size) -{ - void *map_ptr; - struct 
timespec start_time, end_time; - unsigned long scan_time_ns; - - map_size *= MB; - - map_ptr = allocate_memory(NULL, prot, mapping, '*', map_size); - if (!map_ptr) - return KSFT_FAIL; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - if (ksm_merge_pages(map_ptr, map_size, start_time, timeout)) - goto err_out; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - if (ksm_unmerge_pages(map_ptr, map_size, start_time, timeout)) - goto err_out; - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - goto err_out; - } - - scan_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Total size: %lu MiB\n", map_size / MB); - printf("Total time: %ld.%09ld s\n", scan_time_ns / NSEC_PER_SEC, - scan_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", (map_size / MB) / - ((double)scan_time_ns / NSEC_PER_SEC)); - - munmap(map_ptr, map_size); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr, map_size); - return KSFT_FAIL; -} - -static int ksm_cow_time(int mapping, int prot, int timeout, size_t page_size) -{ - void *map_ptr; - struct timespec start_time, end_time; - unsigned long cow_time_ns; - - /* page_count must be less than 2*page_size */ - size_t page_count = 4000; - - map_ptr = allocate_memory(NULL, prot, mapping, '*', page_size * page_count); - if (!map_ptr) - return KSFT_FAIL; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - for (size_t i = 0; i < page_count - 1; i = i + 2) - memset(map_ptr + page_size * i, '-', 1); - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - return KSFT_FAIL; - } - - cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Total size: %lu MiB\n\n", (page_size * page_count) / MB); - printf("Not merged pages:\n"); - printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, - cow_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n\n", ((page_size * (page_count / 2)) / MB) / - ((double)cow_time_ns / NSEC_PER_SEC)); - - /* Create 2000 pairs of duplicate pages */ - for (size_t i = 0; i < page_count - 1; i = i + 2) { - memset(map_ptr + page_size * i, '+', i / 2 + 1); - memset(map_ptr + page_size * (i + 1), '+', i / 2 + 1); - } - if (ksm_merge_pages(map_ptr, page_size * page_count, start_time, timeout)) - goto err_out; - - if (clock_gettime(CLOCK_MONOTONIC_RAW, &start_time)) { - perror("clock_gettime"); - goto err_out; - } - for (size_t i = 0; i < page_count - 1; i = i + 2) - memset(map_ptr + page_size * i, '-', 1); - if (clock_gettime(CLOCK_MONOTONIC_RAW, &end_time)) { - perror("clock_gettime"); - goto err_out; - } - - cow_time_ns = (end_time.tv_sec - start_time.tv_sec) * NSEC_PER_SEC + - (end_time.tv_nsec - start_time.tv_nsec); - - printf("Merged pages:\n"); - printf("Total time: %ld.%09ld s\n", cow_time_ns / NSEC_PER_SEC, - cow_time_ns % NSEC_PER_SEC); - printf("Average speed: %.3f MiB/s\n", ((page_size * (page_count / 2)) / MB) / - ((double)cow_time_ns / NSEC_PER_SEC)); - - munmap(map_ptr, page_size * page_count); - return KSFT_PASS; - -err_out: - printf("Not OK\n"); - munmap(map_ptr, page_size * page_count); - return KSFT_FAIL; -} - -int main(int argc, char *argv[]) -{ - int ret, opt; - int prot = 0; - int ksm_scan_limit_sec = KSM_SCAN_LIMIT_SEC_DEFAULT; - long 
page_count = KSM_PAGE_COUNT_DEFAULT; - size_t page_size = sysconf(_SC_PAGESIZE); - struct ksm_sysfs ksm_sysfs_old; - int test_name = CHECK_KSM_MERGE; - bool use_zero_pages = KSM_USE_ZERO_PAGES_DEFAULT; - bool merge_across_nodes = KSM_MERGE_ACROSS_NODES_DEFAULT; - long size_MB = 0; - - while ((opt = getopt(argc, argv, "ha:p:l:z:m:s:MUZNPCHD")) != -1) { - switch (opt) { - case 'a': - prot = str_to_prot(optarg); - break; - case 'p': - page_count = atol(optarg); - if (page_count <= 0) { - printf("The number of pages must be greater than 0\n"); - return KSFT_FAIL; - } - break; - case 'l': - ksm_scan_limit_sec = atoi(optarg); - if (ksm_scan_limit_sec <= 0) { - printf("Timeout value must be greater than 0\n"); - return KSFT_FAIL; - } - break; - case 'h': - print_help(); - break; - case 'z': - if (strcmp(optarg, "0") == 0) - use_zero_pages = 0; - else - use_zero_pages = 1; - break; - case 'm': - if (strcmp(optarg, "0") == 0) - merge_across_nodes = 0; - else - merge_across_nodes = 1; - break; - case 's': - size_MB = atoi(optarg); - if (size_MB <= 0) { - printf("Size must be greater than 0\n"); - return KSFT_FAIL; - } - case 'M': - break; - case 'U': - test_name = CHECK_KSM_UNMERGE; - break; - case 'Z': - test_name = CHECK_KSM_ZERO_PAGE_MERGE; - break; - case 'N': - test_name = CHECK_KSM_NUMA_MERGE; - break; - case 'P': - test_name = KSM_MERGE_TIME; - break; - case 'H': - test_name = KSM_MERGE_TIME_HUGE_PAGES; - break; - case 'D': - test_name = KSM_UNMERGE_TIME; - break; - case 'C': - test_name = KSM_COW_TIME; - break; - default: - return KSFT_FAIL; - } - } - - if (prot == 0) - prot = str_to_prot(KSM_PROT_STR_DEFAULT); - - if (access(KSM_SYSFS_PATH, F_OK)) { - printf("Config KSM not enabled\n"); - return KSFT_SKIP; - } - - if (ksm_save_def(&ksm_sysfs_old)) { - printf("Cannot save default tunables\n"); - return KSFT_FAIL; - } - - if (ksm_write_sysfs(KSM_FP("run"), 2) || - ksm_write_sysfs(KSM_FP("sleep_millisecs"), 0) || - numa_available() ? 
0 : - ksm_write_sysfs(KSM_FP("merge_across_nodes"), 1) || - ksm_write_sysfs(KSM_FP("pages_to_scan"), page_count)) - return KSFT_FAIL; - - switch (test_name) { - case CHECK_KSM_MERGE: - ret = check_ksm_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, - ksm_scan_limit_sec, page_size); - break; - case CHECK_KSM_UNMERGE: - ret = check_ksm_unmerge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - page_size); - break; - case CHECK_KSM_ZERO_PAGE_MERGE: - ret = check_ksm_zero_page_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, page_count, - ksm_scan_limit_sec, use_zero_pages, page_size); - break; - case CHECK_KSM_NUMA_MERGE: - ret = check_ksm_numa_merge(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - merge_across_nodes, page_size); - break; - case KSM_MERGE_TIME: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } - ret = ksm_merge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - size_MB); - break; - case KSM_MERGE_TIME_HUGE_PAGES: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } - ret = ksm_merge_hugepages_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, - ksm_scan_limit_sec, size_MB); - break; - case KSM_UNMERGE_TIME: - if (size_MB == 0) { - printf("Option '-s' is required.\n"); - return KSFT_FAIL; - } - ret = ksm_unmerge_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, - ksm_scan_limit_sec, size_MB); - break; - case KSM_COW_TIME: - ret = ksm_cow_time(MAP_PRIVATE | MAP_ANONYMOUS, prot, ksm_scan_limit_sec, - page_size); - break; - } - - if (ksm_restore(&ksm_sysfs_old)) { - printf("Cannot restore default tunables\n"); - return KSFT_FAIL; - } - - return ret; -} --- a/tools/testing/selftests/vm/madv_populate.c +++ /dev/null @@ -1,296 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * MADV_POPULATE_READ and MADV_POPULATE_WRITE tests - * - * Copyright 2021, Red Hat, Inc. - * - * Author(s): David Hildenbrand <david@xxxxxxxxxx> - */ -#define _GNU_SOURCE -#include <stdlib.h> -#include <string.h> -#include <stdbool.h> -#include <stdint.h> -#include <unistd.h> -#include <errno.h> -#include <fcntl.h> -#include <linux/mman.h> -#include <sys/mman.h> - -#include "../kselftest.h" -#include "vm_util.h" - -#ifndef MADV_POPULATE_READ -#define MADV_POPULATE_READ 22 -#endif /* MADV_POPULATE_READ */ -#ifndef MADV_POPULATE_WRITE -#define MADV_POPULATE_WRITE 23 -#endif /* MADV_POPULATE_WRITE */ - -/* - * For now, we're using 2 MiB of private anonymous memory for all tests. 
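For readers who have not used these advice values before, the core API boils down to one madvise() call per direction. A minimal, self-contained sketch (editor's illustration, not part of the patch; MADV_POPULATE_* needs Linux 5.14+, older kernels return EINVAL):

#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_POPULATE_WRITE
#define MADV_POPULATE_WRITE 23	/* same fallback the test defines below */
#endif

int populate_example(size_t len)
{
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED)
		return -1;
	/* Preallocate pages and page tables up front, without mlock()'s pinning. */
	if (madvise(p, len, MADV_POPULATE_WRITE))
		perror("MADV_POPULATE_WRITE");
	munmap(p, len);
	return 0;
}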
- */ -#define SIZE (2 * 1024 * 1024) - -static size_t pagesize; - -static void sense_support(void) -{ - char *addr; - int ret; - - addr = mmap(0, pagesize, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (!addr) - ksft_exit_fail_msg("mmap failed\n"); - - ret = madvise(addr, pagesize, MADV_POPULATE_READ); - if (ret) - ksft_exit_skip("MADV_POPULATE_READ is not available\n"); - - ret = madvise(addr, pagesize, MADV_POPULATE_WRITE); - if (ret) - ksft_exit_skip("MADV_POPULATE_WRITE is not available\n"); - - munmap(addr, pagesize); -} - -static void test_prot_read(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(!ret, "MADV_POPULATE_READ with PROT_READ\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(ret == -1 && errno == EINVAL, - "MADV_POPULATE_WRITE with PROT_READ\n"); - - munmap(addr, SIZE); -} - -static void test_prot_write(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(ret == -1 && errno == EINVAL, - "MADV_POPULATE_READ with PROT_WRITE\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(!ret, "MADV_POPULATE_WRITE with PROT_WRITE\n"); - - munmap(addr, SIZE); -} - -static void test_holes(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - ret = munmap(addr + pagesize, pagesize); - if (ret) - ksft_exit_fail_msg("munmap failed\n"); - - /* Hole in the middle */ - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_READ with holes in the middle\n"); - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_WRITE with holes in the middle\n"); - - /* Hole at end */ - ret = madvise(addr, 2 * pagesize, MADV_POPULATE_READ); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_READ with holes at the end\n"); - ret = madvise(addr, 2 * pagesize, MADV_POPULATE_WRITE); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_WRITE with holes at the end\n"); - - /* Hole at beginning */ - ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_READ); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_READ with holes at the beginning\n"); - ret = madvise(addr + pagesize, pagesize, MADV_POPULATE_WRITE); - ksft_test_result(ret == -1 && errno == ENOMEM, - "MADV_POPULATE_WRITE with holes at the beginning\n"); - - munmap(addr, SIZE); -} - -static bool range_is_populated(char *start, ssize_t size) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - bool ret = true; - - if (fd < 0) - ksft_exit_fail_msg("opening pagemap failed\n"); - for (; size > 0 && ret; size -= pagesize, start += pagesize) - if (!pagemap_is_populated(fd, start)) - ret = false; - close(fd); - return ret; -} - -static bool range_is_not_populated(char *start, ssize_t size) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - bool ret = true; - - if (fd < 0) - 
ksft_exit_fail_msg("opening pagemap failed\n"); - for (; size > 0 && ret; size -= pagesize, start += pagesize) - if (pagemap_is_populated(fd, start)) - ret = false; - close(fd); - return ret; -} - -static void test_populate_read(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - ksft_test_result(range_is_not_populated(addr, SIZE), - "range initially not populated\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(!ret, "MADV_POPULATE_READ\n"); - ksft_test_result(range_is_populated(addr, SIZE), - "range is populated\n"); - - munmap(addr, SIZE); -} - -static void test_populate_write(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - ksft_test_result(range_is_not_populated(addr, SIZE), - "range initially not populated\n"); - - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); - ksft_test_result(range_is_populated(addr, SIZE), - "range is populated\n"); - - munmap(addr, SIZE); -} - -static bool range_is_softdirty(char *start, ssize_t size) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - bool ret = true; - - if (fd < 0) - ksft_exit_fail_msg("opening pagemap failed\n"); - for (; size > 0 && ret; size -= pagesize, start += pagesize) - if (!pagemap_is_softdirty(fd, start)) - ret = false; - close(fd); - return ret; -} - -static bool range_is_not_softdirty(char *start, ssize_t size) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - bool ret = true; - - if (fd < 0) - ksft_exit_fail_msg("opening pagemap failed\n"); - for (; size > 0 && ret; size -= pagesize, start += pagesize) - if (pagemap_is_softdirty(fd, start)) - ret = false; - close(fd); - return ret; -} - -static void test_softdirty(void) -{ - char *addr; - int ret; - - ksft_print_msg("[RUN] %s\n", __func__); - - addr = mmap(0, SIZE, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, 0, 0); - if (addr == MAP_FAILED) - ksft_exit_fail_msg("mmap failed\n"); - - /* Clear any softdirty bits. */ - clear_softdirty(); - ksft_test_result(range_is_not_softdirty(addr, SIZE), - "range is not softdirty\n"); - - /* Populating READ should set softdirty. */ - ret = madvise(addr, SIZE, MADV_POPULATE_READ); - ksft_test_result(!ret, "MADV_POPULATE_READ\n"); - ksft_test_result(range_is_not_softdirty(addr, SIZE), - "range is not softdirty\n"); - - /* Populating WRITE should set softdirty. 
*/ - ret = madvise(addr, SIZE, MADV_POPULATE_WRITE); - ksft_test_result(!ret, "MADV_POPULATE_WRITE\n"); - ksft_test_result(range_is_softdirty(addr, SIZE), - "range is softdirty\n"); - - munmap(addr, SIZE); -} - -int main(int argc, char **argv) -{ - int err; - - pagesize = getpagesize(); - - ksft_print_header(); - ksft_set_plan(21); - - sense_support(); - test_prot_read(); - test_prot_write(); - test_holes(); - test_populate_read(); - test_populate_write(); - test_softdirty(); - - err = ksft_get_fail_cnt(); - if (err) - ksft_exit_fail_msg("%d out of %d tests failed\n", - err, ksft_test_num()); - return ksft_exit_pass(); -} --- a/tools/testing/selftests/vm/Makefile +++ /dev/null @@ -1,180 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0 -# Makefile for vm selftests - -LOCAL_HDRS += $(selfdir)/vm/local_config.h $(top_srcdir)/mm/gup_test.h - -include local_config.mk - -uname_M := $(shell uname -m 2>/dev/null || echo not) -MACHINE ?= $(shell echo $(uname_M) | sed -e 's/aarch64.*/arm64/' -e 's/ppc64.*/ppc64/') - -# Without this, failed build products remain, with up-to-date timestamps, -# thus tricking Make (and you!) into believing that All Is Well, in subsequent -# make invocations: -.DELETE_ON_ERROR: - -# Avoid accidental wrong builds, due to built-in rules working just a little -# bit too well--but not quite as well as required for our situation here. -# -# In other words, "make userfaultfd" is supposed to fail to build at all, -# because this Makefile only supports either "make" (all), or "make /full/path". -# However, the built-in rules, if not suppressed, will pick up CFLAGS and the -# initial LDLIBS (but not the target-specific LDLIBS, because those are only -# set for the full path target!). This causes it to get pretty far into building -# things despite using incorrect values such as an *occasionally* incomplete -# LDLIBS. 
-MAKEFLAGS += --no-builtin-rules - -CFLAGS = -Wall -I $(top_srcdir) -I $(top_srcdir)/usr/include $(EXTRA_CFLAGS) $(KHDR_INCLUDES) -LDLIBS = -lrt -lpthread -TEST_GEN_FILES = cow -TEST_GEN_FILES += compaction_test -TEST_GEN_FILES += gup_test -TEST_GEN_FILES += hmm-tests -TEST_GEN_FILES += hugetlb-madvise -TEST_GEN_FILES += hugepage-mmap -TEST_GEN_FILES += hugepage-mremap -TEST_GEN_FILES += hugepage-shm -TEST_GEN_FILES += hugepage-vmemmap -TEST_GEN_FILES += khugepaged -TEST_GEN_PROGS = madv_populate -TEST_GEN_FILES += map_fixed_noreplace -TEST_GEN_FILES += map_hugetlb -TEST_GEN_FILES += map_populate -TEST_GEN_FILES += memfd_secret -TEST_GEN_FILES += migration -TEST_GEN_FILES += mlock-random-test -TEST_GEN_FILES += mlock2-tests -TEST_GEN_FILES += mrelease_test -TEST_GEN_FILES += mremap_dontunmap -TEST_GEN_FILES += mremap_test -TEST_GEN_FILES += on-fault-limit -TEST_GEN_FILES += thuge-gen -TEST_GEN_FILES += transhuge-stress -TEST_GEN_FILES += userfaultfd -TEST_GEN_PROGS += soft-dirty -TEST_GEN_PROGS += split_huge_page_test -TEST_GEN_FILES += ksm_tests -TEST_GEN_PROGS += ksm_functional_tests - -ifeq ($(MACHINE),x86_64) -CAN_BUILD_I386 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_32bit_program.c -m32) -CAN_BUILD_X86_64 := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_64bit_program.c) -CAN_BUILD_WITH_NOPIE := $(shell ./../x86/check_cc.sh "$(CC)" ../x86/trivial_program.c -no-pie) - -VMTARGETS := protection_keys -BINARIES_32 := $(VMTARGETS:%=%_32) -BINARIES_64 := $(VMTARGETS:%=%_64) - -ifeq ($(CAN_BUILD_WITH_NOPIE),1) -CFLAGS += -no-pie -endif - -ifeq ($(CAN_BUILD_I386),1) -TEST_GEN_FILES += $(BINARIES_32) -endif - -ifeq ($(CAN_BUILD_X86_64),1) -TEST_GEN_FILES += $(BINARIES_64) -endif -else - -ifneq (,$(findstring $(MACHINE),ppc64)) -TEST_GEN_FILES += protection_keys -endif - -endif - -ifneq (,$(filter $(MACHINE),arm64 ia64 mips64 parisc64 ppc64 riscv64 s390x sh64 sparc64 x86_64)) -TEST_GEN_FILES += va_128TBswitch -TEST_GEN_FILES += virtual_address_range -TEST_GEN_FILES += write_to_hugetlbfs -endif - -TEST_PROGS := run_vmtests.sh - -TEST_FILES := test_vmalloc.sh -TEST_FILES += test_hmm.sh -TEST_FILES += va_128TBswitch.sh - -include ../lib.mk - -$(OUTPUT)/cow: vm_util.c -$(OUTPUT)/khugepaged: vm_util.c -$(OUTPUT)/ksm_functional_tests: vm_util.c -$(OUTPUT)/madv_populate: vm_util.c -$(OUTPUT)/soft-dirty: vm_util.c -$(OUTPUT)/split_huge_page_test: vm_util.c -$(OUTPUT)/userfaultfd: vm_util.c - -ifeq ($(MACHINE),x86_64) -BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) -BINARIES_64 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_64)) - -define gen-target-rule-32 -$(1) $(1)_32: $(OUTPUT)/$(1)_32 -.PHONY: $(1) $(1)_32 -endef - -define gen-target-rule-64 -$(1) $(1)_64: $(OUTPUT)/$(1)_64 -.PHONY: $(1) $(1)_64 -endef - -ifeq ($(CAN_BUILD_I386),1) -$(BINARIES_32): CFLAGS += -m32 -mxsave -$(BINARIES_32): LDLIBS += -lrt -ldl -lm -$(BINARIES_32): $(OUTPUT)/%_32: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ -$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-32,$(t)))) -endif - -ifeq ($(CAN_BUILD_X86_64),1) -$(BINARIES_64): CFLAGS += -m64 -mxsave -$(BINARIES_64): LDLIBS += -lrt -ldl -$(BINARIES_64): $(OUTPUT)/%_64: %.c - $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(notdir $^) $(LDLIBS) -o $@ -$(foreach t,$(VMTARGETS),$(eval $(call gen-target-rule-64,$(t)))) -endif - -# x86_64 users should be encouraged to install 32-bit libraries -ifeq ($(CAN_BUILD_I386)$(CAN_BUILD_X86_64),01) -all: warn_32bit_failure - -warn_32bit_failure: - @echo "Warning: you seem to have a broken 32-bit build" 2>&1; 
\ - echo "environment. This will reduce test coverage of 64-bit" 2>&1; \ - echo "kernels. If you are using a Debian-like distribution," 2>&1; \ - echo "try:"; 2>&1; \ - echo ""; \ - echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ - echo ""; \ - echo "If you are using a Fedora-like distribution, try:"; \ - echo ""; \ - echo " yum install glibc-devel.*i686"; \ - exit 0; -endif -endif - -# cow_EXTRA_LIBS may get set in local_config.mk, or it may be left empty. -$(OUTPUT)/cow: LDLIBS += $(COW_EXTRA_LIBS) - -$(OUTPUT)/mlock-random-test $(OUTPUT)/memfd_secret: LDLIBS += -lcap - -$(OUTPUT)/ksm_tests: LDLIBS += -lnuma - -$(OUTPUT)/migration: LDLIBS += -lnuma - -local_config.mk local_config.h: check_config.sh - /bin/sh ./check_config.sh $(CC) - -EXTRA_CLEAN += local_config.mk local_config.h - -ifeq ($(COW_EXTRA_LIBS),) -all: warn_missing_liburing - -warn_missing_liburing: - @echo ; \ - echo "Warning: missing liburing support. Some COW tests will be skipped." ; \ - echo -endif --- a/tools/testing/selftests/vm/map_fixed_noreplace.c +++ /dev/null @@ -1,231 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Test that MAP_FIXED_NOREPLACE works. - * - * Copyright 2018, Jann Horn <jannh@xxxxxxxxxx> - * Copyright 2018, Michael Ellerman, IBM Corporation. - */ - -#include <sys/mman.h> -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <unistd.h> - -#ifndef MAP_FIXED_NOREPLACE -#define MAP_FIXED_NOREPLACE 0x100000 -#endif - -static void dump_maps(void) -{ - char cmd[32]; - - snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); - system(cmd); -} - -static unsigned long find_base_addr(unsigned long size) -{ - void *addr; - unsigned long flags; - - flags = MAP_PRIVATE | MAP_ANONYMOUS; - addr = mmap(NULL, size, PROT_NONE, flags, -1, 0); - if (addr == MAP_FAILED) { - printf("Error: couldn't map the space we need for the test\n"); - return 0; - } - - if (munmap(addr, size) != 0) { - printf("Error: couldn't map the space we need for the test\n"); - return 0; - } - return (unsigned long)addr; -} - -int main(void) -{ - unsigned long base_addr; - unsigned long flags, addr, size, page_size; - char *p; - - page_size = sysconf(_SC_PAGE_SIZE); - - //let's find a base addr that is free before we start the tests - size = 5 * page_size; - base_addr = find_base_addr(size); - if (!base_addr) { - printf("Error: couldn't map the space we need for the test\n"); - return 1; - } - - flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED_NOREPLACE; - - // Check we can map all the areas we need below - errno = 0; - addr = base_addr; - size = 5 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error: couldn't map the space we need for the test\n"); - return 1; - } - - errno = 0; - if (munmap((void *)addr, 5 * page_size) != 0) { - dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; - } - printf("unmap() successful\n"); - - errno = 0; - addr = base_addr + page_size; - size = 3 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error: first mmap() failed unexpectedly\n"); - return 1; - } - - /* - * Exact same mapping again: - * base | free | new - * +1 | mapped | new - * +2 | mapped | new - * +3 | mapped | new - * +4 | free | new - */ - errno = 0; - addr = base_addr; - size = 5 * page_size; - p = 
mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:1: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Second mapping contained within first: - * - * base | free | - * +1 | mapped | - * +2 | mapped | new - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr + (2 * page_size); - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:2: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Overlap end of existing mapping: - * base | free | - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | new - * +4 | free | new - */ - errno = 0; - addr = base_addr + (3 * page_size); - size = 2 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:3: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Overlap start of existing mapping: - * base | free | new - * +1 | mapped | new - * +2 | mapped | - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr; - size = 2 * page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p != MAP_FAILED) { - dump_maps(); - printf("Error:4: mmap() succeeded when it shouldn't have\n"); - return 1; - } - - /* - * Adjacent to start of existing mapping: - * base | free | new - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | - * +4 | free | - */ - errno = 0; - addr = base_addr; - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error:5: mmap() failed when it shouldn't have\n"); - return 1; - } - - /* - * Adjacent to end of existing mapping: - * base | free | - * +1 | mapped | - * +2 | mapped | - * +3 | mapped | - * +4 | free | new - */ - errno = 0; - addr = base_addr + (4 * page_size); - size = page_size; - p = mmap((void *)addr, size, PROT_NONE, flags, -1, 0); - printf("mmap() @ 0x%lx-0x%lx p=%p result=%m\n", addr, addr + size, p); - - if (p == MAP_FAILED) { - dump_maps(); - printf("Error:6: mmap() failed when it shouldn't have\n"); - return 1; - } - - addr = base_addr; - size = 5 * page_size; - if (munmap((void *)addr, size) != 0) { - dump_maps(); - printf("Error: munmap failed!?\n"); - return 1; - } - printf("unmap() successful\n"); - - printf("OK\n"); - return 0; -} --- a/tools/testing/selftests/vm/map_hugetlb.c +++ /dev/null @@ -1,109 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Example of using hugepage memory in a user application using the mmap - * system call with MAP_HUGETLB flag. Before running this program make - * sure the administrator has allocated enough default sized huge pages - * to cover the 256 MB allocation. - * - * For ia64 architecture, Linux kernel reserves Region number 4 for hugepages. - * That means the addresses starting with 0x800000... will need to be - * specified. Specifying a fixed address is not required on ppc64, i386 - * or x86_64. 
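The optional shift argument this program accepts is folded into the mmap() flags to select an explicit huge page size. A minimal sketch of that encoding for 2 MiB pages (editor's illustration, not part of the patch; the mapping length must be a multiple of the huge page size):

#include <sys/mman.h>

#ifndef MAP_HUGETLB
#define MAP_HUGETLB 0x40000
#endif
#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26
#endif

/* 21 == log2(2 MiB); the size shift lives in bits [31:26] of the flags. */
static void *map_2m_hugetlb(size_t len)
{
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
		    (21 << MAP_HUGE_SHIFT), -1, 0);
}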
- */ -#include <stdlib.h> -#include <stdio.h> -#include <unistd.h> -#include <sys/mman.h> -#include <fcntl.h> - -#define LENGTH (256UL*1024*1024) -#define PROTECTION (PROT_READ | PROT_WRITE) - -#ifndef MAP_HUGETLB -#define MAP_HUGETLB 0x40000 /* arch specific */ -#endif - -#ifndef MAP_HUGE_SHIFT -#define MAP_HUGE_SHIFT 26 -#endif - -#ifndef MAP_HUGE_MASK -#define MAP_HUGE_MASK 0x3f -#endif - -/* Only ia64 requires this */ -#ifdef __ia64__ -#define ADDR (void *)(0x8000000000000000UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_FIXED) -#else -#define ADDR (void *)(0x0UL) -#define FLAGS (MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB) -#endif - -static void check_bytes(char *addr) -{ - printf("First hex is %x\n", *((unsigned int *)addr)); -} - -static void write_bytes(char *addr, size_t length) -{ - unsigned long i; - - for (i = 0; i < length; i++) - *(addr + i) = (char)i; -} - -static int read_bytes(char *addr, size_t length) -{ - unsigned long i; - - check_bytes(addr); - for (i = 0; i < length; i++) - if (*(addr + i) != (char)i) { - printf("Mismatch at %lu\n", i); - return 1; - } - return 0; -} - -int main(int argc, char **argv) -{ - void *addr; - int ret; - size_t length = LENGTH; - int flags = FLAGS; - int shift = 0; - - if (argc > 1) - length = atol(argv[1]) << 20; - if (argc > 2) { - shift = atoi(argv[2]); - if (shift) - flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; - } - - if (shift) - printf("%u kB hugepages\n", 1 << (shift - 10)); - else - printf("Default size hugepages\n"); - printf("Mapping %lu Mbytes\n", (unsigned long)length >> 20); - - addr = mmap(ADDR, length, PROTECTION, flags, -1, 0); - if (addr == MAP_FAILED) { - perror("mmap"); - exit(1); - } - - printf("Returned address is %p\n", addr); - check_bytes(addr); - write_bytes(addr, length); - ret = read_bytes(addr, length); - - /* munmap() length of MAP_HUGETLB memory must be hugepage aligned */ - if (munmap(addr, length)) { - perror("munmap"); - exit(1); - } - - return ret; -} --- a/tools/testing/selftests/vm/map_populate.c +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2018 Dmitry Safonov, Arista Networks - * - * MAP_POPULATE | MAP_PRIVATE should COW VMA pages. 
- */ - -#define _GNU_SOURCE -#include <errno.h> -#include <fcntl.h> -#include <sys/mman.h> -#include <sys/socket.h> -#include <sys/types.h> -#include <sys/wait.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> - -#ifndef MMAP_SZ -#define MMAP_SZ 4096 -#endif - -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s:%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - exit(1); \ - } \ - } while (0) - -static int parent_f(int sock, unsigned long *smap, int child) -{ - int status, ret; - - ret = read(sock, &status, sizeof(int)); - BUG_ON(ret <= 0, "read(sock)"); - - *smap = 0x22222BAD; - ret = msync(smap, MMAP_SZ, MS_SYNC); - BUG_ON(ret, "msync()"); - - ret = write(sock, &status, sizeof(int)); - BUG_ON(ret <= 0, "write(sock)"); - - waitpid(child, &status, 0); - BUG_ON(!WIFEXITED(status), "child in unexpected state"); - - return WEXITSTATUS(status); -} - -static int child_f(int sock, unsigned long *smap, int fd) -{ - int ret, buf = 0; - - smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_POPULATE, fd, 0); - BUG_ON(smap == MAP_FAILED, "mmap()"); - - BUG_ON(*smap != 0xdeadbabe, "MAP_PRIVATE | MAP_POPULATE changed file"); - - ret = write(sock, &buf, sizeof(int)); - BUG_ON(ret <= 0, "write(sock)"); - - ret = read(sock, &buf, sizeof(int)); - BUG_ON(ret <= 0, "read(sock)"); - - BUG_ON(*smap == 0x22222BAD, "MAP_POPULATE didn't COW private page"); - BUG_ON(*smap != 0xdeadbabe, "mapping was corrupted"); - - return 0; -} - -int main(int argc, char **argv) -{ - int sock[2], child, ret; - FILE *ftmp; - unsigned long *smap; - - ftmp = tmpfile(); - BUG_ON(ftmp == 0, "tmpfile()"); - - ret = ftruncate(fileno(ftmp), MMAP_SZ); - BUG_ON(ret, "ftruncate()"); - - smap = mmap(0, MMAP_SZ, PROT_READ | PROT_WRITE, - MAP_SHARED, fileno(ftmp), 0); - BUG_ON(smap == MAP_FAILED, "mmap()"); - - *smap = 0xdeadbabe; - /* Probably unnecessary, but let it be. */ - ret = msync(smap, MMAP_SZ, MS_SYNC); - BUG_ON(ret, "msync()"); - - ret = socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sock); - BUG_ON(ret, "socketpair()"); - - child = fork(); - BUG_ON(child == -1, "fork()"); - - if (child) { - ret = close(sock[0]); - BUG_ON(ret, "close()"); - - return parent_f(sock[1], smap, child); - } - - ret = close(sock[1]); - BUG_ON(ret, "close()"); - - return child_f(sock[0], smap, fileno(ftmp)); -} --- a/tools/testing/selftests/vm/memfd_secret.c +++ /dev/null @@ -1,296 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright IBM Corporation, 2021 - * - * Author: Mike Rapoport <rppt@xxxxxxxxxxxxx> - */ - -#define _GNU_SOURCE -#include <sys/uio.h> -#include <sys/mman.h> -#include <sys/wait.h> -#include <sys/types.h> -#include <sys/ptrace.h> -#include <sys/syscall.h> -#include <sys/resource.h> -#include <sys/capability.h> - -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <errno.h> -#include <stdio.h> - -#include "../kselftest.h" - -#define fail(fmt, ...) ksft_test_result_fail(fmt, ##__VA_ARGS__) -#define pass(fmt, ...) ksft_test_result_pass(fmt, ##__VA_ARGS__) -#define skip(fmt, ...) 
ksft_test_result_skip(fmt, ##__VA_ARGS__) - -#ifdef __NR_memfd_secret - -#define PATTERN 0x55 - -static const int prot = PROT_READ | PROT_WRITE; -static const int mode = MAP_SHARED; - -static unsigned long page_size; -static unsigned long mlock_limit_cur; -static unsigned long mlock_limit_max; - -static int memfd_secret(unsigned int flags) -{ - return syscall(__NR_memfd_secret, flags); -} - -static void test_file_apis(int fd) -{ - char buf[64]; - - if ((read(fd, buf, sizeof(buf)) >= 0) || - (write(fd, buf, sizeof(buf)) >= 0) || - (pread(fd, buf, sizeof(buf), 0) >= 0) || - (pwrite(fd, buf, sizeof(buf), 0) >= 0)) - fail("unexpected file IO\n"); - else - pass("file IO is blocked as expected\n"); -} - -static void test_mlock_limit(int fd) -{ - size_t len; - char *mem; - - len = mlock_limit_cur; - mem = mmap(NULL, len, prot, mode, fd, 0); - if (mem == MAP_FAILED) { - fail("unable to mmap secret memory\n"); - return; - } - munmap(mem, len); - - len = mlock_limit_max * 2; - mem = mmap(NULL, len, prot, mode, fd, 0); - if (mem != MAP_FAILED) { - fail("unexpected mlock limit violation\n"); - munmap(mem, len); - return; - } - - pass("mlock limit is respected\n"); -} - -static void try_process_vm_read(int fd, int pipefd[2]) -{ - struct iovec liov, riov; - char buf[64]; - char *mem; - - if (read(pipefd[0], &mem, sizeof(mem)) < 0) { - fail("pipe write: %s\n", strerror(errno)); - exit(KSFT_FAIL); - } - - liov.iov_len = riov.iov_len = sizeof(buf); - liov.iov_base = buf; - riov.iov_base = mem; - - if (process_vm_readv(getppid(), &liov, 1, &riov, 1, 0) < 0) { - if (errno == ENOSYS) - exit(KSFT_SKIP); - exit(KSFT_PASS); - } - - exit(KSFT_FAIL); -} - -static void try_ptrace(int fd, int pipefd[2]) -{ - pid_t ppid = getppid(); - int status; - char *mem; - long ret; - - if (read(pipefd[0], &mem, sizeof(mem)) < 0) { - perror("pipe write"); - exit(KSFT_FAIL); - } - - ret = ptrace(PTRACE_ATTACH, ppid, 0, 0); - if (ret) { - perror("ptrace_attach"); - exit(KSFT_FAIL); - } - - ret = waitpid(ppid, &status, WUNTRACED); - if ((ret != ppid) || !(WIFSTOPPED(status))) { - fprintf(stderr, "weird waitppid result %ld stat %x\n", - ret, status); - exit(KSFT_FAIL); - } - - if (ptrace(PTRACE_PEEKDATA, ppid, mem, 0)) - exit(KSFT_PASS); - - exit(KSFT_FAIL); -} - -static void check_child_status(pid_t pid, const char *name) -{ - int status; - - waitpid(pid, &status, 0); - - if (WIFEXITED(status) && WEXITSTATUS(status) == KSFT_SKIP) { - skip("%s is not supported\n", name); - return; - } - - if ((WIFEXITED(status) && WEXITSTATUS(status) == KSFT_PASS) || - WIFSIGNALED(status)) { - pass("%s is blocked as expected\n", name); - return; - } - - fail("%s: unexpected memory access\n", name); -} - -static void test_remote_access(int fd, const char *name, - void (*func)(int fd, int pipefd[2])) -{ - int pipefd[2]; - pid_t pid; - char *mem; - - if (pipe(pipefd)) { - fail("pipe failed: %s\n", strerror(errno)); - return; - } - - pid = fork(); - if (pid < 0) { - fail("fork failed: %s\n", strerror(errno)); - return; - } - - if (pid == 0) { - func(fd, pipefd); - return; - } - - mem = mmap(NULL, page_size, prot, mode, fd, 0); - if (mem == MAP_FAILED) { - fail("Unable to mmap secret memory\n"); - return; - } - - ftruncate(fd, page_size); - memset(mem, PATTERN, page_size); - - if (write(pipefd[1], &mem, sizeof(mem)) < 0) { - fail("pipe write: %s\n", strerror(errno)); - return; - } - - check_child_status(pid, name); -} - -static void test_process_vm_read(int fd) -{ - test_remote_access(fd, "process_vm_read", try_process_vm_read); -} - -static void 
test_ptrace(int fd) -{ - test_remote_access(fd, "ptrace", try_ptrace); -} - -static int set_cap_limits(rlim_t max) -{ - struct rlimit new; - cap_t cap = cap_init(); - - new.rlim_cur = max; - new.rlim_max = max; - if (setrlimit(RLIMIT_MEMLOCK, &new)) { - perror("setrlimit() returns error"); - return -1; - } - - /* drop capabilities including CAP_IPC_LOCK */ - if (cap_set_proc(cap)) { - perror("cap_set_proc() returns error"); - return -2; - } - - return 0; -} - -static void prepare(void) -{ - struct rlimit rlim; - - page_size = sysconf(_SC_PAGE_SIZE); - if (!page_size) - ksft_exit_fail_msg("Failed to get page size %s\n", - strerror(errno)); - - if (getrlimit(RLIMIT_MEMLOCK, &rlim)) - ksft_exit_fail_msg("Unable to detect mlock limit: %s\n", - strerror(errno)); - - mlock_limit_cur = rlim.rlim_cur; - mlock_limit_max = rlim.rlim_max; - - printf("page_size: %ld, mlock.soft: %ld, mlock.hard: %ld\n", - page_size, mlock_limit_cur, mlock_limit_max); - - if (page_size > mlock_limit_cur) - mlock_limit_cur = page_size; - if (page_size > mlock_limit_max) - mlock_limit_max = page_size; - - if (set_cap_limits(mlock_limit_max)) - ksft_exit_fail_msg("Unable to set mlock limit: %s\n", - strerror(errno)); -} - -#define NUM_TESTS 4 - -int main(int argc, char *argv[]) -{ - int fd; - - prepare(); - - ksft_print_header(); - ksft_set_plan(NUM_TESTS); - - fd = memfd_secret(0); - if (fd < 0) { - if (errno == ENOSYS) - ksft_exit_skip("memfd_secret is not supported\n"); - else - ksft_exit_fail_msg("memfd_secret failed: %s\n", - strerror(errno)); - } - - test_mlock_limit(fd); - test_file_apis(fd); - test_process_vm_read(fd); - test_ptrace(fd); - - close(fd); - - ksft_finished(); -} - -#else /* __NR_memfd_secret */ - -int main(int argc, char *argv[]) -{ - printf("skip: skipping memfd_secret test (missing __NR_memfd_secret)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_memfd_secret */ --- a/tools/testing/selftests/vm/migration.c +++ /dev/null @@ -1,193 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * The main purpose of the tests here is to exercise the migration entry code - * paths in the kernel. 
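For readers new to move_pages(): the migrate() helper below just loops over this one primitive, bouncing a page between two nodes until RUNTIME expires while the other threads keep touching it. A single forced migration looks like this (editor's sketch, not part of the patch; link with -lnuma, and MPOL_MF_MOVE_ALL needs CAP_SYS_NICE):

#include <numaif.h>
#include <stdio.h>

/* Move the page backing @page to @to_node; returns 0 on success. */
static int migrate_once(void *page, int to_node)
{
	int status = 0;
	long ret = move_pages(0, 1, &page, &to_node, &status,
			      MPOL_MF_MOVE_ALL);

	if (ret < 0)
		perror("move_pages");
	else if (ret > 0)
		fprintf(stderr, "page not migrated, status %d\n", status);
	return ret ? -1 : 0;
}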
- */ - -#include "../kselftest_harness.h" -#include <strings.h> -#include <pthread.h> -#include <numa.h> -#include <numaif.h> -#include <sys/mman.h> -#include <sys/types.h> -#include <signal.h> -#include <time.h> - -#define TWOMEG (2<<20) -#define RUNTIME (60) - -#define ALIGN(x, a) (((x) + (a - 1)) & (~((a) - 1))) - -FIXTURE(migration) -{ - pthread_t *threads; - pid_t *pids; - int nthreads; - int n1; - int n2; -}; - -FIXTURE_SETUP(migration) -{ - int n; - - ASSERT_EQ(numa_available(), 0); - self->nthreads = numa_num_task_cpus() - 1; - self->n1 = -1; - self->n2 = -1; - - for (n = 0; n < numa_max_possible_node(); n++) - if (numa_bitmask_isbitset(numa_all_nodes_ptr, n)) { - if (self->n1 == -1) { - self->n1 = n; - } else { - self->n2 = n; - break; - } - } - - self->threads = malloc(self->nthreads * sizeof(*self->threads)); - ASSERT_NE(self->threads, NULL); - self->pids = malloc(self->nthreads * sizeof(*self->pids)); - ASSERT_NE(self->pids, NULL); -}; - -FIXTURE_TEARDOWN(migration) -{ - free(self->threads); - free(self->pids); -} - -int migrate(uint64_t *ptr, int n1, int n2) -{ - int ret, tmp; - int status = 0; - struct timespec ts1, ts2; - - if (clock_gettime(CLOCK_MONOTONIC, &ts1)) - return -1; - - while (1) { - if (clock_gettime(CLOCK_MONOTONIC, &ts2)) - return -1; - - if (ts2.tv_sec - ts1.tv_sec >= RUNTIME) - return 0; - - ret = move_pages(0, 1, (void **) &ptr, &n2, &status, - MPOL_MF_MOVE_ALL); - if (ret) { - if (ret > 0) - printf("Didn't migrate %d pages\n", ret); - else - perror("Couldn't migrate pages"); - return -2; - } - - tmp = n2; - n2 = n1; - n1 = tmp; - } - - return 0; -} - -void *access_mem(void *ptr) -{ - uint64_t y = 0; - volatile uint64_t *x = ptr; - - while (1) { - pthread_testcancel(); - y += *x; - } - - return NULL; -} - -/* - * Basic migration entry testing. One thread will move pages back and forth - * between nodes whilst other threads try and access them triggering the - * migration entry wait paths in the kernel. - */ -TEST_F_TIMEOUT(migration, private_anon, 2*RUNTIME) -{ - uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(ptr, MAP_FAILED); - - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) - if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) - perror("Couldn't create thread"); - - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(pthread_cancel(self->threads[i]), 0); -} - -/* - * Same as the previous test but with shared memory. - */ -TEST_F_TIMEOUT(migration, shared_anon, 2*RUNTIME) -{ - pid_t pid; - uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - - ptr = mmap(NULL, TWOMEG, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(ptr, MAP_FAILED); - - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) { - pid = fork(); - if (!pid) - access_mem(ptr); - else - self->pids[i] = pid; - } - - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(kill(self->pids[i], SIGTERM), 0); -} - -/* - * Tests the pmd migration entry paths. 
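A quick worked example of the ALIGN() macro that the THP case below relies on (editor's note, not part of the patch): with a = TWOMEG = 0x200000,

	ALIGN(0x7f321234, 0x200000)
	    == (0x7f321234 + 0x1fffff) & ~0x1fffff
	    == 0x7f400000,

the next 2 MiB boundary; rounding the doubled mmap() area up like this is what lets madvise(MADV_HUGEPAGE) back the range with a single PMD.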
- */ -TEST_F_TIMEOUT(migration, private_anon_thp, 2*RUNTIME) -{ - uint64_t *ptr; - int i; - - if (self->nthreads < 2 || self->n1 < 0 || self->n2 < 0) - SKIP(return, "Not enough threads or NUMA nodes available"); - - ptr = mmap(NULL, 2*TWOMEG, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(ptr, MAP_FAILED); - - ptr = (uint64_t *) ALIGN((uintptr_t) ptr, TWOMEG); - ASSERT_EQ(madvise(ptr, TWOMEG, MADV_HUGEPAGE), 0); - memset(ptr, 0xde, TWOMEG); - for (i = 0; i < self->nthreads - 1; i++) - if (pthread_create(&self->threads[i], NULL, access_mem, ptr)) - perror("Couldn't create thread"); - - ASSERT_EQ(migrate(ptr, self->n1, self->n2), 0); - for (i = 0; i < self->nthreads - 1; i++) - ASSERT_EQ(pthread_cancel(self->threads[i]), 0); -} - -TEST_HARNESS_MAIN --- a/tools/testing/selftests/vm/mlock2.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <syscall.h> -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> - -#ifndef MLOCK_ONFAULT -#define MLOCK_ONFAULT 1 -#endif - -#ifndef MCL_ONFAULT -#define MCL_ONFAULT (MCL_FUTURE << 1) -#endif - -static int mlock2_(void *start, size_t len, int flags) -{ -#ifdef __NR_mlock2 - return syscall(__NR_mlock2, start, len, flags); -#else - errno = ENOSYS; - return -1; -#endif -} - -static FILE *seek_to_smaps_entry(unsigned long addr) -{ - FILE *file; - char *line = NULL; - size_t size = 0; - unsigned long start, end; - char perms[5]; - unsigned long offset; - char dev[32]; - unsigned long inode; - char path[BUFSIZ]; - - file = fopen("/proc/self/smaps", "r"); - if (!file) { - perror("fopen smaps"); - _exit(1); - } - - while (getline(&line, &size, file) > 0) { - if (sscanf(line, "%lx-%lx %s %lx %s %lu %s\n", - &start, &end, perms, &offset, dev, &inode, path) < 6) - goto next; - - if (start <= addr && addr < end) - goto out; - -next: - free(line); - line = NULL; - size = 0; - } - - fclose(file); - file = NULL; - -out: - free(line); - return file; -} --- a/tools/testing/selftests/vm/mlock2-tests.c +++ /dev/null @@ -1,520 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include <sys/mman.h> -#include <stdint.h> -#include <unistd.h> -#include <string.h> -#include <sys/time.h> -#include <sys/resource.h> -#include <stdbool.h> -#include "mlock2.h" - -#include "../kselftest.h" - -struct vm_boundaries { - unsigned long start; - unsigned long end; -}; - -static int get_vm_area(unsigned long addr, struct vm_boundaries *area) -{ - FILE *file; - int ret = 1; - char line[1024] = {0}; - char *end_addr; - char *stop; - unsigned long start; - unsigned long end; - - if (!area) - return ret; - - file = fopen("/proc/self/maps", "r"); - if (!file) { - perror("fopen"); - return ret; - } - - memset(area, 0, sizeof(struct vm_boundaries)); - - while(fgets(line, 1024, file)) { - end_addr = strchr(line, '-'); - if (!end_addr) { - printf("cannot parse /proc/self/maps\n"); - goto out; - } - *end_addr = '\0'; - end_addr++; - stop = strchr(end_addr, ' '); - if (!stop) { - printf("cannot parse /proc/self/maps\n"); - goto out; - } - stop = '\0'; - - sscanf(line, "%lx", &start); - sscanf(end_addr, "%lx", &end); - - if (start <= addr && end > addr) { - area->start = start; - area->end = end; - ret = 0; - goto out; - } - } -out: - fclose(file); - return ret; -} - -#define VMFLAGS "VmFlags:" - -static bool is_vmflag_set(unsigned long addr, const char *vmflag) -{ - char *line = NULL; - char *flags; - size_t size = 0; - bool ret = false; - FILE *smaps; - - smaps = seek_to_smaps_entry(addr); - if (!smaps) { - 
printf("Unable to parse /proc/self/smaps\n"); - goto out; - } - - while (getline(&line, &size, smaps) > 0) { - if (!strstr(line, VMFLAGS)) { - free(line); - line = NULL; - size = 0; - continue; - } - - flags = line + strlen(VMFLAGS); - ret = (strstr(flags, vmflag) != NULL); - goto out; - } - -out: - free(line); - fclose(smaps); - return ret; -} - -#define SIZE "Size:" -#define RSS "Rss:" -#define LOCKED "lo" - -static unsigned long get_value_for_name(unsigned long addr, const char *name) -{ - char *line = NULL; - size_t size = 0; - char *value_ptr; - FILE *smaps = NULL; - unsigned long value = -1UL; - - smaps = seek_to_smaps_entry(addr); - if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); - goto out; - } - - while (getline(&line, &size, smaps) > 0) { - if (!strstr(line, name)) { - free(line); - line = NULL; - size = 0; - continue; - } - - value_ptr = line + strlen(name); - if (sscanf(value_ptr, "%lu kB", &value) < 1) { - printf("Unable to parse smaps entry for Size\n"); - goto out; - } - break; - } - -out: - if (smaps) - fclose(smaps); - free(line); - return value; -} - -static bool is_vma_lock_on_fault(unsigned long addr) -{ - bool locked; - unsigned long vma_size, vma_rss; - - locked = is_vmflag_set(addr, LOCKED); - if (!locked) - return false; - - vma_size = get_value_for_name(addr, SIZE); - vma_rss = get_value_for_name(addr, RSS); - - /* only one page is faulted in */ - return (vma_rss < vma_size); -} - -#define PRESENT_BIT 0x8000000000000000ULL -#define PFN_MASK 0x007FFFFFFFFFFFFFULL -#define UNEVICTABLE_BIT (1UL << 18) - -static int lock_check(unsigned long addr) -{ - bool locked; - unsigned long vma_size, vma_rss; - - locked = is_vmflag_set(addr, LOCKED); - if (!locked) - return false; - - vma_size = get_value_for_name(addr, SIZE); - vma_rss = get_value_for_name(addr, RSS); - - return (vma_rss == vma_size); -} - -static int unlock_lock_check(char *map) -{ - if (is_vmflag_set((unsigned long)map, LOCKED)) { - printf("VMA flag %s is present on page 1 after unlock\n", LOCKED); - return 1; - } - - return 0; -} - -static int test_mlock_lock() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } - - if (mlock2_(map, 2 * page_size, 0)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(0)"); - goto unmap; - } - - if (!lock_check((unsigned long)map)) - goto unmap; - - /* Now unlock and recheck attributes */ - if (munlock(map, 2 * page_size)) { - perror("munlock()"); - goto unmap; - } - - ret = unlock_lock_check(map); - -unmap: - munmap(map, 2 * page_size); -out: - return ret; -} - -static int onfault_check(char *map) -{ - *map = 'a'; - if (!is_vma_lock_on_fault((unsigned long)map)) { - printf("VMA is not marked for lock on fault\n"); - return 1; - } - - return 0; -} - -static int unlock_onfault_check(char *map) -{ - unsigned long page_size = getpagesize(); - - if (is_vma_lock_on_fault((unsigned long)map) || - is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA is still lock on fault after unlock\n"); - return 1; - } - - return 0; -} - -static int test_mlock_onfault() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked 
mmap"); - goto out; - } - - if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; - } - - if (onfault_check(map)) - goto unmap; - - /* Now unlock and recheck attributes */ - if (munlock(map, 2 * page_size)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("munlock()"); - goto unmap; - } - - ret = unlock_onfault_check(map); -unmap: - munmap(map, 2 * page_size); -out: - return ret; -} - -static int test_lock_onfault_of_present() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("test_mlock_locked mmap"); - goto out; - } - - *map = 'a'; - - if (mlock2_(map, 2 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock2(MLOCK_ONFAULT)"); - goto unmap; - } - - if (!is_vma_lock_on_fault((unsigned long)map) || - !is_vma_lock_on_fault((unsigned long)map + page_size)) { - printf("VMA with present pages is not marked lock on fault\n"); - goto unmap; - } - ret = 0; -unmap: - munmap(map, 2 * page_size); -out: - return ret; -} - -static int test_munlockall() -{ - char *map; - int ret = 1; - unsigned long page_size = getpagesize(); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - if (map == MAP_FAILED) { - perror("test_munlockall mmap"); - goto out; - } - - if (mlockall(MCL_CURRENT)) { - perror("mlockall(MCL_CURRENT)"); - goto out; - } - - if (!lock_check((unsigned long)map)) - goto unmap; - - if (munlockall()) { - perror("munlockall()"); - goto unmap; - } - - if (unlock_lock_check(map)) - goto unmap; - - munmap(map, 2 * page_size); - - map = mmap(NULL, 2 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - if (map == MAP_FAILED) { - perror("test_munlockall second mmap"); - goto out; - } - - if (mlockall(MCL_CURRENT | MCL_ONFAULT)) { - perror("mlockall(MCL_CURRENT | MCL_ONFAULT)"); - goto unmap; - } - - if (onfault_check(map)) - goto unmap; - - if (munlockall()) { - perror("munlockall()"); - goto unmap; - } - - if (unlock_onfault_check(map)) - goto unmap; - - if (mlockall(MCL_CURRENT | MCL_FUTURE)) { - perror("mlockall(MCL_CURRENT | MCL_FUTURE)"); - goto out; - } - - if (!lock_check((unsigned long)map)) - goto unmap; - - if (munlockall()) { - perror("munlockall()"); - goto unmap; - } - - ret = unlock_lock_check(map); - -unmap: - munmap(map, 2 * page_size); -out: - munlockall(); - return ret; -} - -static int test_vma_management(bool call_mlock) -{ - int ret = 1; - void *map; - unsigned long page_size = getpagesize(); - struct vm_boundaries page1; - struct vm_boundaries page2; - struct vm_boundaries page3; - - map = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (map == MAP_FAILED) { - perror("mmap()"); - return ret; - } - - if (call_mlock && mlock2_(map, 3 * page_size, MLOCK_ONFAULT)) { - if (errno == ENOSYS) { - printf("Cannot call new mlock family, skipping test\n"); - _exit(KSFT_SKIP); - } - perror("mlock(ONFAULT)\n"); - goto out; - } - - if (get_vm_area((unsigned long)map, &page1) || - get_vm_area((unsigned long)map + page_size, &page2) || - get_vm_area((unsigned long)map + page_size * 2, &page3)) { - 
printf("couldn't find mapping in /proc/self/maps\n"); - goto out; - } - - /* - * Before we unlock a portion, we need to that all three pages are in - * the same VMA. If they are not we abort this test (Note that this is - * not a failure) - */ - if (page1.start != page2.start || page2.start != page3.start) { - printf("VMAs are not merged to start, aborting test\n"); - ret = 0; - goto out; - } - - if (munlock(map + page_size, page_size)) { - perror("munlock()"); - goto out; - } - - if (get_vm_area((unsigned long)map, &page1) || - get_vm_area((unsigned long)map + page_size, &page2) || - get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; - } - - /* All three VMAs should be different */ - if (page1.start == page2.start || page2.start == page3.start) { - printf("failed to split VMA for munlock\n"); - goto out; - } - - /* Now unlock the first and third page and check the VMAs again */ - if (munlock(map, page_size * 3)) { - perror("munlock()"); - goto out; - } - - if (get_vm_area((unsigned long)map, &page1) || - get_vm_area((unsigned long)map + page_size, &page2) || - get_vm_area((unsigned long)map + page_size * 2, &page3)) { - printf("couldn't find mapping in /proc/self/maps\n"); - goto out; - } - - /* Now all three VMAs should be the same */ - if (page1.start != page2.start || page2.start != page3.start) { - printf("failed to merge VMAs after munlock\n"); - goto out; - } - - ret = 0; -out: - munmap(map, 3 * page_size); - return ret; -} - -static int test_mlockall(int (test_function)(bool call_mlock)) -{ - int ret = 1; - - if (mlockall(MCL_CURRENT | MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } - - ret = test_function(false); - munlockall(); - return ret; -} - -int main(int argc, char **argv) -{ - int ret = 0; - ret += test_mlock_lock(); - ret += test_mlock_onfault(); - ret += test_munlockall(); - ret += test_lock_onfault_of_present(); - ret += test_vma_management(true); - ret += test_mlockall(test_vma_management); - return ret; -} --- a/tools/testing/selftests/vm/mlock-random-test.c +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * It tests the mlock/mlock2() when they are invoked - * on randomly memory region. 
- */ -#include <unistd.h> -#include <sys/resource.h> -#include <sys/capability.h> -#include <sys/mman.h> -#include <fcntl.h> -#include <string.h> -#include <sys/ipc.h> -#include <sys/shm.h> -#include <time.h> -#include "mlock2.h" - -#define CHUNK_UNIT (128 * 1024) -#define MLOCK_RLIMIT_SIZE (CHUNK_UNIT * 2) -#define MLOCK_WITHIN_LIMIT_SIZE CHUNK_UNIT -#define MLOCK_OUTOF_LIMIT_SIZE (CHUNK_UNIT * 3) - -#define TEST_LOOP 100 -#define PAGE_ALIGN(size, ps) (((size) + ((ps) - 1)) & ~((ps) - 1)) - -int set_cap_limits(rlim_t max) -{ - struct rlimit new; - cap_t cap = cap_init(); - - new.rlim_cur = max; - new.rlim_max = max; - if (setrlimit(RLIMIT_MEMLOCK, &new)) { - perror("setrlimit() returns error\n"); - return -1; - } - - /* drop capabilities including CAP_IPC_LOCK */ - if (cap_set_proc(cap)) { - perror("cap_set_proc() returns error\n"); - return -2; - } - - return 0; -} - -int get_proc_locked_vm_size(void) -{ - FILE *f; - int ret = -1; - char line[1024] = {0}; - unsigned long lock_size = 0; - - f = fopen("/proc/self/status", "r"); - if (!f) { - perror("fopen"); - return -1; - } - - while (fgets(line, 1024, f)) { - if (strstr(line, "VmLck")) { - ret = sscanf(line, "VmLck:\t%8lu kB", &lock_size); - if (ret <= 0) { - printf("sscanf() on VmLck error: %s: %d\n", - line, ret); - fclose(f); - return -1; - } - fclose(f); - return (int)(lock_size << 10); - } - } - - perror("cannot parse VmLck in /proc/self/status\n"); - fclose(f); - return -1; -} - -/* - * Get the MMUPageSize of the memory region including input - * address from proc file. - * - * return value: on error case, 0 will be returned. - * Otherwise the page size(in bytes) is returned. - */ -int get_proc_page_size(unsigned long addr) -{ - FILE *smaps; - char *line; - unsigned long mmupage_size = 0; - size_t size; - - smaps = seek_to_smaps_entry(addr); - if (!smaps) { - printf("Unable to parse /proc/self/smaps\n"); - return 0; - } - - while (getline(&line, &size, smaps) > 0) { - if (!strstr(line, "MMUPageSize")) { - free(line); - line = NULL; - size = 0; - continue; - } - - /* found the MMUPageSize of this section */ - if (sscanf(line, "MMUPageSize: %8lu kB", - &mmupage_size) < 1) { - printf("Unable to parse smaps entry for Size:%s\n", - line); - break; - } - - } - free(line); - if (smaps) - fclose(smaps); - return mmupage_size << 10; -} - -/* - * Test mlock/mlock2() on provided memory chunk. - * It expects the mlock/mlock2() to be successful (within rlimit) - * - * With allocated memory chunk [p, p + alloc_size), this - * test will choose start/len randomly to perform mlock/mlock2 - * [start, start + len] memory range. The range is within range - * of the allocated chunk. - * - * The memory region size alloc_size is within the rlimit. - * So we always expect a success of mlock/mlock2. - * - * VmLck is assumed to be 0 before this test. 
- * - * return value: 0 - success - * else: failure - */ -int test_mlock_within_limit(char *p, int alloc_size) -{ - int i; - int ret = 0; - int locked_vm_size = 0; - struct rlimit cur; - int page_size = 0; - - getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur < alloc_size) { - printf("alloc_size[%d] < %u rlimit,lead to mlock failure\n", - alloc_size, (unsigned int)cur.rlim_cur); - return -1; - } - - srand(time(NULL)); - for (i = 0; i < TEST_LOOP; i++) { - /* - * - choose mlock/mlock2 randomly - * - choose lock_size randomly but lock_size < alloc_size - * - choose start_offset randomly but p+start_offset+lock_size - * < p+alloc_size - */ - int is_mlock = !!(rand() % 2); - int lock_size = rand() % alloc_size; - int start_offset = rand() % (alloc_size - lock_size); - - if (is_mlock) - ret = mlock(p + start_offset, lock_size); - else - ret = mlock2_(p + start_offset, lock_size, - MLOCK_ONFAULT); - - if (ret) { - printf("%s() failure at |%p(%d)| mlock:|%p(%d)|\n", - is_mlock ? "mlock" : "mlock2", - p, alloc_size, - p + start_offset, lock_size); - return ret; - } - } - - /* - * Check VmLck left by the tests. - */ - locked_vm_size = get_proc_locked_vm_size(); - page_size = get_proc_page_size((unsigned long)p); - if (page_size == 0) { - printf("cannot get proc MMUPageSize\n"); - return -1; - } - - if (locked_vm_size > PAGE_ALIGN(alloc_size, page_size) + page_size) { - printf("test_mlock_within_limit() left VmLck:%d on %d chunk\n", - locked_vm_size, alloc_size); - return -1; - } - - return 0; -} - - -/* - * We expect the mlock/mlock2() to be fail (outof limitation) - * - * With allocated memory chunk [p, p + alloc_size), this - * test will randomly choose start/len and perform mlock/mlock2 - * on [start, start+len] range. - * - * The memory region size alloc_size is above the rlimit. - * And the len to be locked is higher than rlimit. - * So we always expect a failure of mlock/mlock2. - * No locked page number should be increased as a side effect. - * - * return value: 0 - success - * else: failure - */ -int test_mlock_outof_limit(char *p, int alloc_size) -{ - int i; - int ret = 0; - int locked_vm_size = 0, old_locked_vm_size = 0; - struct rlimit cur; - - getrlimit(RLIMIT_MEMLOCK, &cur); - if (cur.rlim_cur >= alloc_size) { - printf("alloc_size[%d] >%u rlimit, violates test condition\n", - alloc_size, (unsigned int)cur.rlim_cur); - return -1; - } - - old_locked_vm_size = get_proc_locked_vm_size(); - srand(time(NULL)); - for (i = 0; i < TEST_LOOP; i++) { - int is_mlock = !!(rand() % 2); - int lock_size = (rand() % (alloc_size - cur.rlim_cur)) - + cur.rlim_cur; - int start_offset = rand() % (alloc_size - lock_size); - - if (is_mlock) - ret = mlock(p + start_offset, lock_size); - else - ret = mlock2_(p + start_offset, lock_size, - MLOCK_ONFAULT); - if (ret == 0) { - printf("%s() succeeds? on %p(%d) mlock%p(%d)\n", - is_mlock ? 
"mlock" : "mlock2", - p, alloc_size, - p + start_offset, lock_size); - return -1; - } - } - - locked_vm_size = get_proc_locked_vm_size(); - if (locked_vm_size != old_locked_vm_size) { - printf("tests leads to new mlocked page: old[%d], new[%d]\n", - old_locked_vm_size, - locked_vm_size); - return -1; - } - - return 0; -} - -int main(int argc, char **argv) -{ - char *p = NULL; - int ret = 0; - - if (set_cap_limits(MLOCK_RLIMIT_SIZE)) - return -1; - - p = malloc(MLOCK_WITHIN_LIMIT_SIZE); - if (p == NULL) { - perror("malloc() failure\n"); - return -1; - } - ret = test_mlock_within_limit(p, MLOCK_WITHIN_LIMIT_SIZE); - if (ret) - return ret; - munlock(p, MLOCK_WITHIN_LIMIT_SIZE); - free(p); - - - p = malloc(MLOCK_OUTOF_LIMIT_SIZE); - if (p == NULL) { - perror("malloc() failure\n"); - return -1; - } - ret = test_mlock_outof_limit(p, MLOCK_OUTOF_LIMIT_SIZE); - if (ret) - return ret; - munlock(p, MLOCK_OUTOF_LIMIT_SIZE); - free(p); - - return 0; -} --- a/tools/testing/selftests/vm/mrelease_test.c +++ /dev/null @@ -1,206 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2022 Google LLC - */ -#define _GNU_SOURCE -#include <errno.h> -#include <stdbool.h> -#include <stdio.h> -#include <stdlib.h> -#include <sys/wait.h> -#include <unistd.h> - -#include "util.h" - -#include "../kselftest.h" - -#ifndef __NR_pidfd_open -#define __NR_pidfd_open -1 -#endif - -#ifndef __NR_process_mrelease -#define __NR_process_mrelease -1 -#endif - -#define MB(x) (x << 20) -#define MAX_SIZE_MB 1024 - -static int alloc_noexit(unsigned long nr_pages, int pipefd) -{ - int ppid = getppid(); - int timeout = 10; /* 10sec timeout to get killed */ - unsigned long i; - char *buf; - - buf = (char *)mmap(NULL, nr_pages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, 0, 0); - if (buf == MAP_FAILED) { - perror("mmap failed, halting the test"); - return KSFT_FAIL; - } - - for (i = 0; i < nr_pages; i++) - *((unsigned long *)(buf + (i * PAGE_SIZE))) = i; - - /* Signal the parent that the child is ready */ - if (write(pipefd, "", 1) < 0) { - perror("write"); - return KSFT_FAIL; - } - - /* Wait to be killed (when reparenting happens) */ - while (getppid() == ppid && timeout > 0) { - sleep(1); - timeout--; - } - - munmap(buf, nr_pages * PAGE_SIZE); - - return (timeout > 0) ? KSFT_PASS : KSFT_FAIL; -} - -/* The process_mrelease calls in this test are expected to fail */ -static void run_negative_tests(int pidfd) -{ - int res; - /* Test invalid flags. Expect to fail with EINVAL error code. */ - if (!syscall(__NR_process_mrelease, pidfd, (unsigned int)-1) || - errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong flags"); - exit(res); - } - /* - * Test reaping while process is alive with no pending SIGKILL. - * Expect to fail with EINVAL error code. - */ - if (!syscall(__NR_process_mrelease, pidfd, 0) || errno != EINVAL) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease on a live process"); - exit(res); - } -} - -static int child_main(int pipefd[], size_t size) -{ - int res; - - /* Allocate and fault-in memory and wait to be killed */ - close(pipefd[0]); - res = alloc_noexit(MB(size) / PAGE_SIZE, pipefd[1]); - close(pipefd[1]); - return res; -} - -int main(void) -{ - int pipefd[2], pidfd; - bool success, retry; - size_t size; - pid_t pid; - char byte; - int res; - - /* Test a wrong pidfd */ - if (!syscall(__NR_process_mrelease, -1, 0) || errno != EBADF) { - res = (errno == ENOSYS ? 
KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease with wrong pidfd"); - exit(res); - } - - /* Start the test with 1MB child memory allocation */ - size = 1; -retry: - /* - * Pipe for the child to signal when it's done allocating - * memory - */ - if (pipe(pipefd)) { - perror("pipe"); - exit(KSFT_FAIL); - } - pid = fork(); - if (pid < 0) { - perror("fork"); - close(pipefd[0]); - close(pipefd[1]); - exit(KSFT_FAIL); - } - - if (pid == 0) { - /* Child main routine */ - res = child_main(pipefd, size); - exit(res); - } - - /* - * Parent main routine: - * Wait for the child to finish allocations, then kill and reap - */ - close(pipefd[1]); - /* Block until the child is ready */ - res = read(pipefd[0], &byte, 1); - close(pipefd[0]); - if (res < 0) { - perror("read"); - if (!kill(pid, SIGKILL)) - waitpid(pid, NULL, 0); - exit(KSFT_FAIL); - } - - pidfd = syscall(__NR_pidfd_open, pid, 0); - if (pidfd < 0) { - perror("pidfd_open"); - if (!kill(pid, SIGKILL)) - waitpid(pid, NULL, 0); - exit(KSFT_FAIL); - } - - /* Run negative tests which require a live child */ - run_negative_tests(pidfd); - - if (kill(pid, SIGKILL)) { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("kill"); - exit(res); - } - - success = (syscall(__NR_process_mrelease, pidfd, 0) == 0); - if (!success) { - /* - * If we failed to reap because the child exited too soon, - * before we could call process_mrelease. Double child's memory - * which causes it to spend more time on cleanup and increases - * our chances of reaping its memory before it exits. - * Retry until we succeed or reach MAX_SIZE_MB. - */ - if (errno == ESRCH) { - retry = (size <= MAX_SIZE_MB); - } else { - res = (errno == ENOSYS ? KSFT_SKIP : KSFT_FAIL); - perror("process_mrelease"); - waitpid(pid, NULL, 0); - exit(res); - } - } - - /* Cleanup to prevent zombies */ - if (waitpid(pid, NULL, 0) < 0) { - perror("waitpid"); - exit(KSFT_FAIL); - } - close(pidfd); - - if (!success) { - if (retry) { - size *= 2; - goto retry; - } - printf("All process_mrelease attempts failed!\n"); - exit(KSFT_FAIL); - } - - printf("Success reaping a child with %zuMB of memory allocations\n", - size); - return KSFT_PASS; -} --- a/tools/testing/selftests/vm/mremap_dontunmap.c +++ /dev/null @@ -1,364 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 - -/* - * Tests for mremap w/ MREMAP_DONTUNMAP. - * - * Copyright 2020, Brian Geffon <bgeffon@xxxxxxxxxx> - */ -#define _GNU_SOURCE -#include <sys/mman.h> -#include <errno.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> - -#include "../kselftest.h" - -#ifndef MREMAP_DONTUNMAP -#define MREMAP_DONTUNMAP 4 -#endif - -unsigned long page_size; -char *page_buffer; - -static void dump_maps(void) -{ - char cmd[32]; - - snprintf(cmd, sizeof(cmd), "cat /proc/%d/maps", getpid()); - system(cmd); -} - -#define BUG_ON(condition, description) \ - do { \ - if (condition) { \ - fprintf(stderr, "[FAIL]\t%s():%d\t%s:%s\n", __func__, \ - __LINE__, (description), strerror(errno)); \ - dump_maps(); \ - exit(1); \ - } \ - } while (0) - -// Try a simple operation for to "test" for kernel support this prevents -// reporting tests as failed when it's run on an older kernel. -static int kernel_support_for_mremap_dontunmap() -{ - int ret = 0; - unsigned long num_pages = 1; - void *source_mapping = mmap(NULL, num_pages * page_size, PROT_NONE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - - // This simple remap should only fail if MREMAP_DONTUNMAP isn't - // supported. 
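	// [Editor's note, not part of the original file: kernels before 5.7
	// reject unknown mremap() flags with EINVAL, so this probe cleanly
	// separates "kernel too old" from a genuine failure. On success the
	// call leaves *both* mappings in place -- that is the point of
	// MREMAP_DONTUNMAP -- which is why both get munmap()ed afterwards.]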
- void *dest_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, 0); - if (dest_mapping == MAP_FAILED) { - ret = errno; - } else { - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - } - - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); - return ret; -} - -// This helper will just validate that an entire mapping contains the expected -// byte. -static int check_region_contains_byte(void *addr, unsigned long size, char byte) -{ - BUG_ON(size & (page_size - 1), - "check_region_contains_byte expects page multiples"); - BUG_ON((unsigned long)addr & (page_size - 1), - "check_region_contains_byte expects page alignment"); - - memset(page_buffer, byte, page_size); - - unsigned long num_pages = size / page_size; - unsigned long i; - - // Compare each page checking that it contains our expected byte. - for (i = 0; i < num_pages; ++i) { - int ret = - memcmp(addr + (i * page_size), page_buffer, page_size); - if (ret) { - return ret; - } - } - - return 0; -} - -// this test validates that MREMAP_DONTUNMAP moves the pagetables while leaving -// the source mapping mapped. -static void mremap_dontunmap_simple() -{ - unsigned long num_pages = 5; - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - - memset(source_mapping, 'a', num_pages * page_size); - - // Try to just move the whole mapping anywhere (not fixed). - void *dest_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // Validate that the pages have been moved, we know they were moved if - // the dest_mapping contains a's. - BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 0) != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that MREMAP_DONTUNMAP on a shared mapping works as expected. -static void mremap_dontunmap_simple_shmem() -{ - unsigned long num_pages = 5; - - int mem_fd = memfd_create("memfd", MFD_CLOEXEC); - BUG_ON(mem_fd < 0, "memfd_create"); - - BUG_ON(ftruncate(mem_fd, num_pages * page_size) < 0, - "ftruncate"); - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_FILE | MAP_SHARED, mem_fd, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - - BUG_ON(close(mem_fd) < 0, "close"); - - memset(source_mapping, 'a', num_pages * page_size); - - // Try to just move the whole mapping anywhere (not fixed). - void *dest_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - if (dest_mapping == MAP_FAILED && errno == EINVAL) { - // Old kernel which doesn't support MREMAP_DONTUNMAP on shmem. - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); - return; - } - - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // Validate that the pages have been moved, we know they were moved if - // the dest_mapping contains a's. 
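	// [Editor's note, not part of the original file: the second check
	// below expects 'a' at the source precisely because the data lives
	// in the shmem object rather than in the moved page tables; its
	// "source should have no ptes" failure message is a copy-paste
	// leftover from the anonymous test and does not match what is
	// actually asserted.]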
- BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - - // Because the region is backed by shmem, we will actually see the same - // memory at the source location still. - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 'a') != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates MREMAP_DONTUNMAP will move page tables to a specific -// destination using MREMAP_FIXED, also while validating that the source -// remains intact. -static void mremap_dontunmap_simple_fixed() -{ - unsigned long num_pages = 5; - - // Since we want to guarantee that we can remap to a point, we will - // create a mapping up front. - void *dest_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(dest_mapping == MAP_FAILED, "mmap"); - memset(dest_mapping, 'X', num_pages * page_size); - - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', num_pages * page_size); - - void *remapped_mapping = - mremap(source_mapping, num_pages * page_size, num_pages * page_size, - MREMAP_FIXED | MREMAP_DONTUNMAP | MREMAP_MAYMOVE, - dest_mapping); - BUG_ON(remapped_mapping == MAP_FAILED, "mremap"); - BUG_ON(remapped_mapping != dest_mapping, - "mremap should have placed the remapped mapping at dest_mapping"); - - // The dest mapping will have been unmap by mremap so we expect the Xs - // to be gone and replaced with a's. - BUG_ON(check_region_contains_byte - (dest_mapping, num_pages * page_size, 'a') != 0, - "pages did not migrate"); - - // And the source mapping will have had its ptes dropped. - BUG_ON(check_region_contains_byte - (source_mapping, num_pages * page_size, 0) != 0, - "source should have no ptes"); - - BUG_ON(munmap(dest_mapping, num_pages * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that we can MREMAP_DONTUNMAP for a portion of an -// existing mapping. -static void mremap_dontunmap_partial_mapping() -{ - /* - * source mapping: - * -------------- - * | aaaaaaaaaa | - * -------------- - * to become: - * -------------- - * | aaaaa00000 | - * -------------- - * With the destination mapping containing 5 pages of As. - * --------- - * | aaaaa | - * --------- - */ - unsigned long num_pages = 10; - void *source_mapping = - mmap(NULL, num_pages * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', num_pages * page_size); - - // We will grab the last 5 pages of the source and move them. - void *dest_mapping = - mremap(source_mapping + (5 * page_size), 5 * page_size, - 5 * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE, NULL); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - - // We expect the first 5 pages of the source to contain a's and the - // final 5 pages to contain zeros. 
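	// [Editor's note, not part of the original file: the zeros are what
	// prove the ptes really moved -- the tail of the source is still a
	// private anonymous mapping, just with empty page tables, so reading
	// it demand-faults zero pages; any stale 'a' would mean the kernel
	// left old entries behind.]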
- BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 'a') != - 0, "first 5 pages of source should have original pages"); - BUG_ON(check_region_contains_byte - (source_mapping + (5 * page_size), 5 * page_size, 0) != 0, - "final 5 pages of source should have no ptes"); - - // Finally we expect the destination to have 5 pages worth of a's. - BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != - 0, "dest mapping should contain ptes from the source"); - - BUG_ON(munmap(dest_mapping, 5 * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, num_pages * page_size) == -1, - "unable to unmap source mapping"); -} - -// This test validates that we can remap over only a portion of a mapping. -static void mremap_dontunmap_partial_mapping_overwrite(void) -{ - /* - * source mapping: - * --------- - * |aaaaa| - * --------- - * dest mapping initially: - * ----------- - * |XXXXXXXXXX| - * ------------ - * Source to become: - * --------- - * |00000| - * --------- - * With the destination mapping containing 5 pages of As. - * ------------ - * |aaaaaXXXXX| - * ------------ - */ - void *source_mapping = - mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(source_mapping == MAP_FAILED, "mmap"); - memset(source_mapping, 'a', 5 * page_size); - - void *dest_mapping = - mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(dest_mapping == MAP_FAILED, "mmap"); - memset(dest_mapping, 'X', 10 * page_size); - - // We will grab the last 5 pages of the source and move them. - void *remapped_mapping = - mremap(source_mapping, 5 * page_size, - 5 * page_size, - MREMAP_DONTUNMAP | MREMAP_MAYMOVE | MREMAP_FIXED, dest_mapping); - BUG_ON(dest_mapping == MAP_FAILED, "mremap"); - BUG_ON(dest_mapping != remapped_mapping, "expected to remap to dest_mapping"); - - BUG_ON(check_region_contains_byte(source_mapping, 5 * page_size, 0) != - 0, "first 5 pages of source should have no ptes"); - - // Finally we expect the destination to have 5 pages worth of a's. - BUG_ON(check_region_contains_byte(dest_mapping, 5 * page_size, 'a') != 0, - "dest mapping should contain ptes from the source"); - - // Finally the last 5 pages shouldn't have been touched. - BUG_ON(check_region_contains_byte(dest_mapping + (5 * page_size), - 5 * page_size, 'X') != 0, - "dest mapping should have retained the last 5 pages"); - - BUG_ON(munmap(dest_mapping, 10 * page_size) == -1, - "unable to unmap destination mapping"); - BUG_ON(munmap(source_mapping, 5 * page_size) == -1, - "unable to unmap source mapping"); -} - -int main(void) -{ - page_size = sysconf(_SC_PAGE_SIZE); - - // test for kernel support for MREMAP_DONTUNMAP skipping the test if - // not. - if (kernel_support_for_mremap_dontunmap() != 0) { - printf("No kernel support for MREMAP_DONTUNMAP\n"); - return KSFT_SKIP; - } - - // Keep a page sized buffer around for when we need it. 
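	// [Editor's note, not part of the original file: the scratch buffer
	// is mmap()ed rather than malloc()ed so it is guaranteed to be
	// page-sized and page-aligned, matching the page-at-a-time memcmp()
	// done in check_region_contains_byte().]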
- page_buffer = - mmap(NULL, page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - BUG_ON(page_buffer == MAP_FAILED, "unable to mmap a page."); - - mremap_dontunmap_simple(); - mremap_dontunmap_simple_shmem(); - mremap_dontunmap_simple_fixed(); - mremap_dontunmap_partial_mapping(); - mremap_dontunmap_partial_mapping_overwrite(); - - BUG_ON(munmap(page_buffer, page_size) == -1, - "unable to unmap page buffer"); - - printf("OK\n"); - return 0; -} --- a/tools/testing/selftests/vm/mremap_test.c +++ /dev/null @@ -1,550 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2020 Google LLC - */ -#define _GNU_SOURCE - -#include <errno.h> -#include <stdlib.h> -#include <stdio.h> -#include <string.h> -#include <sys/mman.h> -#include <time.h> -#include <stdbool.h> - -#include "../kselftest.h" - -#define EXPECT_SUCCESS 0 -#define EXPECT_FAILURE 1 -#define NON_OVERLAPPING 0 -#define OVERLAPPING 1 -#define NS_PER_SEC 1000000000ULL -#define VALIDATION_DEFAULT_THRESHOLD 4 /* 4MB */ -#define VALIDATION_NO_THRESHOLD 0 /* Verify the entire region */ - -#define MIN(X, Y) ((X) < (Y) ? (X) : (Y)) - -struct config { - unsigned long long src_alignment; - unsigned long long dest_alignment; - unsigned long long region_size; - int overlapping; -}; - -struct test { - const char *name; - struct config config; - int expect_failure; -}; - -enum { - _1KB = 1ULL << 10, /* 1KB -> not page aligned */ - _4KB = 4ULL << 10, - _8KB = 8ULL << 10, - _1MB = 1ULL << 20, - _2MB = 2ULL << 20, - _4MB = 4ULL << 20, - _1GB = 1ULL << 30, - _2GB = 2ULL << 30, - PMD = _2MB, - PUD = _1GB, -}; - -#define PTE page_size - -#define MAKE_TEST(source_align, destination_align, size, \ - overlaps, should_fail, test_name) \ -(struct test){ \ - .name = test_name, \ - .config = { \ - .src_alignment = source_align, \ - .dest_alignment = destination_align, \ - .region_size = size, \ - .overlapping = overlaps, \ - }, \ - .expect_failure = should_fail \ -} - -/* - * Returns false if the requested remap region overlaps with an - * existing mapping (e.g text, stack) else returns true. - */ -static bool is_remap_region_valid(void *addr, unsigned long long size) -{ - void *remap_addr = NULL; - bool ret = true; - - /* Use MAP_FIXED_NOREPLACE flag to ensure region is not mapped */ - remap_addr = mmap(addr, size, PROT_READ | PROT_WRITE, - MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, - -1, 0); - - if (remap_addr == MAP_FAILED) { - if (errno == EEXIST) - ret = false; - } else { - munmap(remap_addr, size); - } - - return ret; -} - -/* Returns mmap_min_addr sysctl tunable from procfs */ -static unsigned long long get_mmap_min_addr(void) -{ - FILE *fp; - int n_matched; - static unsigned long long addr; - - if (addr) - return addr; - - fp = fopen("/proc/sys/vm/mmap_min_addr", "r"); - if (fp == NULL) { - ksft_print_msg("Failed to open /proc/sys/vm/mmap_min_addr: %s\n", - strerror(errno)); - exit(KSFT_SKIP); - } - - n_matched = fscanf(fp, "%llu", &addr); - if (n_matched != 1) { - ksft_print_msg("Failed to read /proc/sys/vm/mmap_min_addr: %s\n", - strerror(errno)); - fclose(fp); - exit(KSFT_SKIP); - } - - fclose(fp); - return addr; -} - -/* - * Using /proc/self/maps, assert that the specified address range is contained - * within a single mapping. 
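 *
 * [Editor's note, not part of the original file: a maps line has the
 * form "559f1a2b3000-559f1a2b6000 rw-p 00000000 00:00 0", so the two
 * strtok() calls below split on "- " to peel off the start and end
 * addresses, which strtol(..., NULL, 16) then parses as hex.]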
- */ -static bool is_range_mapped(void *start, void *end) -{ - FILE *fp; - char *line = NULL; - size_t len = 0; - bool success = false; - - fp = fopen("/proc/self/maps", "r"); - if (fp == NULL) - return false; - - while (getline(&line, &len, fp) != -1) { - char *first = strtok(line, "- "); - void *first_val = (void *)strtol(first, NULL, 16); - char *second = strtok(NULL, "- "); - void *second_val = (void *) strtol(second, NULL, 16); - - if (first_val <= start && second_val >= end) { - success = true; - break; - } - } - - fclose(fp); - return success; -} - -/* - * This test validates that merge is called when expanding a mapping. - * Mapping containing three pages is created, middle page is unmapped - * and then the mapping containing the first page is expanded so that - * it fills the created hole. The two parts should merge creating - * single mapping with three pages. - */ -static void mremap_expand_merge(unsigned long page_size) -{ - char *test_name = "mremap expand merge"; - bool success = false; - int errsv = 0; - char *remap; - char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (start == MAP_FAILED) { - errsv = errno; - goto error; - } - - munmap(start + page_size, page_size); - remap = mremap(start, page_size, 2 * page_size, 0); - if (remap == MAP_FAILED) { - errsv = errno; - munmap(start, page_size); - munmap(start + 2 * page_size, page_size); - goto error; - } - - success = is_range_mapped(start, start + 3 * page_size); - - munmap(start, 3 * page_size); - goto out; - -error: - ksft_print_msg("Unexpected mapping/remapping error: %s\n", - strerror(errsv)); -out: - if (success) - ksft_test_result_pass("%s\n", test_name); - else - ksft_test_result_fail("%s\n", test_name); -} - -/* - * Similar to mremap_expand_merge() except instead of removing the middle page, - * we remove the last then attempt to remap offset from the second page. This - * should result in the mapping being restored to its former state. - */ -static void mremap_expand_merge_offset(unsigned long page_size) -{ - - char *test_name = "mremap expand merge offset"; - bool success = false; - int errsv = 0; - char *remap; - char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (start == MAP_FAILED) { - errsv = errno; - goto error; - } - - /* Unmap final page to ensure we have space to expand. */ - munmap(start + 2 * page_size, page_size); - remap = mremap(start + page_size, page_size, 2 * page_size, 0); - if (remap == MAP_FAILED) { - errsv = errno; - munmap(start, 2 * page_size); - goto error; - } - - success = is_range_mapped(start, start + 3 * page_size); - goto out; - -error: - ksft_print_msg("Unexpected mapping/remapping error: %s\n", - strerror(errsv)); -out: - if (success) - ksft_test_result_pass("%s\n", test_name); - else - ksft_test_result_fail("%s\n", test_name); -} - -/* - * Returns the start address of the mapping on success, else returns - * NULL on failure. 
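 *
 * [Editor's note, not part of the original file: the helper walks upward
 * from mmap_min_addr in src_alignment steps, probing each candidate with
 * MAP_FIXED_NOREPLACE so it can never clobber an existing mapping;
 * EPERM or EEXIST from a probe simply means "try the next address".]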
- */ -static void *get_source_mapping(struct config c) -{ - unsigned long long addr = 0ULL; - void *src_addr = NULL; - unsigned long long mmap_min_addr; - - mmap_min_addr = get_mmap_min_addr(); - -retry: - addr += c.src_alignment; - if (addr < mmap_min_addr) - goto retry; - - src_addr = mmap((void *) addr, c.region_size, PROT_READ | PROT_WRITE, - MAP_FIXED_NOREPLACE | MAP_ANONYMOUS | MAP_SHARED, - -1, 0); - if (src_addr == MAP_FAILED) { - if (errno == EPERM || errno == EEXIST) - goto retry; - goto error; - } - /* - * Check that the address is aligned to the specified alignment. - * Addresses which have alignments that are multiples of that - * specified are not considered valid. For instance, 1GB address is - * 2MB-aligned, however it will not be considered valid for a - * requested alignment of 2MB. This is done to reduce coincidental - * alignment in the tests. - */ - if (((unsigned long long) src_addr & (c.src_alignment - 1)) || - !((unsigned long long) src_addr & c.src_alignment)) { - munmap(src_addr, c.region_size); - goto retry; - } - - if (!src_addr) - goto error; - - return src_addr; -error: - ksft_print_msg("Failed to map source region: %s\n", - strerror(errno)); - return NULL; -} - -/* Returns the time taken for the remap on success else returns -1. */ -static long long remap_region(struct config c, unsigned int threshold_mb, - char pattern_seed) -{ - void *addr, *src_addr, *dest_addr; - unsigned long long i; - struct timespec t_start = {0, 0}, t_end = {0, 0}; - long long start_ns, end_ns, align_mask, ret, offset; - unsigned long long threshold; - - if (threshold_mb == VALIDATION_NO_THRESHOLD) - threshold = c.region_size; - else - threshold = MIN(threshold_mb * _1MB, c.region_size); - - src_addr = get_source_mapping(c); - if (!src_addr) { - ret = -1; - goto out; - } - - /* Set byte pattern */ - srand(pattern_seed); - for (i = 0; i < threshold; i++) - memset((char *) src_addr + i, (char) rand(), 1); - - /* Mask to zero out lower bits of address for alignment */ - align_mask = ~(c.dest_alignment - 1); - /* Offset of destination address from the end of the source region */ - offset = (c.overlapping) ? 
-c.dest_alignment : c.dest_alignment; - addr = (void *) (((unsigned long long) src_addr + c.region_size - + offset) & align_mask); - - /* See comment in get_source_mapping() */ - if (!((unsigned long long) addr & c.dest_alignment)) - addr = (void *) ((unsigned long long) addr | c.dest_alignment); - - /* Don't destroy existing mappings unless expected to overlap */ - while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { - /* Check for unsigned overflow */ - if (addr + c.dest_alignment < addr) { - ksft_print_msg("Couldn't find a valid region to remap to\n"); - ret = -1; - goto out; - } - addr += c.dest_alignment; - } - - clock_gettime(CLOCK_MONOTONIC, &t_start); - dest_addr = mremap(src_addr, c.region_size, c.region_size, - MREMAP_MAYMOVE|MREMAP_FIXED, (char *) addr); - clock_gettime(CLOCK_MONOTONIC, &t_end); - - if (dest_addr == MAP_FAILED) { - ksft_print_msg("mremap failed: %s\n", strerror(errno)); - ret = -1; - goto clean_up_src; - } - - /* Verify byte pattern after remapping */ - srand(pattern_seed); - for (i = 0; i < threshold; i++) { - char c = (char) rand(); - - if (((char *) dest_addr)[i] != c) { - ksft_print_msg("Data after remap doesn't match at offset %d\n", - i); - ksft_print_msg("Expected: %#x\t Got: %#x\n", c & 0xff, - ((char *) dest_addr)[i] & 0xff); - ret = -1; - goto clean_up_dest; - } - } - - start_ns = t_start.tv_sec * NS_PER_SEC + t_start.tv_nsec; - end_ns = t_end.tv_sec * NS_PER_SEC + t_end.tv_nsec; - ret = end_ns - start_ns; - -/* - * Since the destination address is specified using MREMAP_FIXED, subsequent - * mremap will unmap any previous mapping at the address range specified by - * dest_addr and region_size. This significantly affects the remap time of - * subsequent tests. So we clean up mappings after each test. - */ -clean_up_dest: - munmap(dest_addr, c.region_size); -clean_up_src: - munmap(src_addr, c.region_size); -out: - return ret; -} - -static void run_mremap_test_case(struct test test_case, int *failures, - unsigned int threshold_mb, - unsigned int pattern_seed) -{ - long long remap_time = remap_region(test_case.config, threshold_mb, - pattern_seed); - - if (remap_time < 0) { - if (test_case.expect_failure) - ksft_test_result_xfail("%s\n\tExpected mremap failure\n", - test_case.name); - else { - ksft_test_result_fail("%s\n", test_case.name); - *failures += 1; - } - } else { - /* - * Comparing mremap time is only applicable if entire region - * was faulted in. 
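 *
 * [Editor's note, not part of the original file: with the default 4MB
 * threshold only the first 4MB of a region is written and verified, so a
 * 2GB remap mostly moves unpopulated page tables; timings are therefore
 * only printed when the whole region was faulted in and the numbers are
 * comparable across alignment configurations.]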
- */ - if (threshold_mb == VALIDATION_NO_THRESHOLD || - test_case.config.region_size <= threshold_mb * _1MB) - ksft_test_result_pass("%s\n\tmremap time: %12lldns\n", - test_case.name, remap_time); - else - ksft_test_result_pass("%s\n", test_case.name); - } -} - -static void usage(const char *cmd) -{ - fprintf(stderr, - "Usage: %s [[-t <threshold_mb>] [-p <pattern_seed>]]\n" - "-t\t only validate threshold_mb of the remapped region\n" - " \t if 0 is supplied no threshold is used; all tests\n" - " \t are run and remapped regions validated fully.\n" - " \t The default threshold used is 4MB.\n" - "-p\t provide a seed to generate the random pattern for\n" - " \t validating the remapped region.\n", cmd); -} - -static int parse_args(int argc, char **argv, unsigned int *threshold_mb, - unsigned int *pattern_seed) -{ - const char *optstr = "t:p:"; - int opt; - - while ((opt = getopt(argc, argv, optstr)) != -1) { - switch (opt) { - case 't': - *threshold_mb = atoi(optarg); - break; - case 'p': - *pattern_seed = atoi(optarg); - break; - default: - usage(argv[0]); - return -1; - } - } - - if (optind < argc) { - usage(argv[0]); - return -1; - } - - return 0; -} - -#define MAX_TEST 13 -#define MAX_PERF_TEST 3 -int main(int argc, char **argv) -{ - int failures = 0; - int i, run_perf_tests; - unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; - unsigned int pattern_seed; - int num_expand_tests = 1; - struct test test_cases[MAX_TEST]; - struct test perf_test_cases[MAX_PERF_TEST]; - int page_size; - time_t t; - - pattern_seed = (unsigned int) time(&t); - - if (parse_args(argc, argv, &threshold_mb, &pattern_seed) < 0) - exit(EXIT_FAILURE); - - ksft_print_msg("Test configs:\n\tthreshold_mb=%u\n\tpattern_seed=%u\n\n", - threshold_mb, pattern_seed); - - page_size = sysconf(_SC_PAGESIZE); - - /* Expected mremap failures */ - test_cases[0] = MAKE_TEST(page_size, page_size, page_size, - OVERLAPPING, EXPECT_FAILURE, - "mremap - Source and Destination Regions Overlapping"); - - test_cases[1] = MAKE_TEST(page_size, page_size/4, page_size, - NON_OVERLAPPING, EXPECT_FAILURE, - "mremap - Destination Address Misaligned (1KB-aligned)"); - test_cases[2] = MAKE_TEST(page_size/4, page_size, page_size, - NON_OVERLAPPING, EXPECT_FAILURE, - "mremap - Source Address Misaligned (1KB-aligned)"); - - /* Src addr PTE aligned */ - test_cases[3] = MAKE_TEST(PTE, PTE, PTE * 2, - NON_OVERLAPPING, EXPECT_SUCCESS, - "8KB mremap - Source PTE-aligned, Destination PTE-aligned"); - - /* Src addr 1MB aligned */ - test_cases[4] = MAKE_TEST(_1MB, PTE, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2MB mremap - Source 1MB-aligned, Destination PTE-aligned"); - test_cases[5] = MAKE_TEST(_1MB, _1MB, _2MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2MB mremap - Source 1MB-aligned, Destination 1MB-aligned"); - - /* Src addr PMD aligned */ - test_cases[6] = MAKE_TEST(PMD, PTE, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination PTE-aligned"); - test_cases[7] = MAKE_TEST(PMD, _1MB, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination 1MB-aligned"); - test_cases[8] = MAKE_TEST(PMD, PMD, _4MB, NON_OVERLAPPING, EXPECT_SUCCESS, - "4MB mremap - Source PMD-aligned, Destination PMD-aligned"); - - /* Src addr PUD aligned */ - test_cases[9] = MAKE_TEST(PUD, PTE, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PTE-aligned"); - test_cases[10] = MAKE_TEST(PUD, _1MB, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination 
1MB-aligned"); - test_cases[11] = MAKE_TEST(PUD, PMD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PMD-aligned"); - test_cases[12] = MAKE_TEST(PUD, PUD, _2GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "2GB mremap - Source PUD-aligned, Destination PUD-aligned"); - - perf_test_cases[0] = MAKE_TEST(page_size, page_size, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PTE-aligned, Destination PTE-aligned"); - /* - * mremap 1GB region - Page table level aligned time - * comparison. - */ - perf_test_cases[1] = MAKE_TEST(PMD, PMD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PMD-aligned, Destination PMD-aligned"); - perf_test_cases[2] = MAKE_TEST(PUD, PUD, _1GB, NON_OVERLAPPING, EXPECT_SUCCESS, - "1GB mremap - Source PUD-aligned, Destination PUD-aligned"); - - run_perf_tests = (threshold_mb == VALIDATION_NO_THRESHOLD) || - (threshold_mb * _1MB >= _1GB); - - ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? - ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); - - for (i = 0; i < ARRAY_SIZE(test_cases); i++) - run_mremap_test_case(test_cases[i], &failures, threshold_mb, - pattern_seed); - - mremap_expand_merge(page_size); - mremap_expand_merge_offset(page_size); - - if (run_perf_tests) { - ksft_print_msg("\n%s\n", - "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); - for (i = 0; i < ARRAY_SIZE(perf_test_cases); i++) - run_mremap_test_case(perf_test_cases[i], &failures, - threshold_mb, pattern_seed); - } - - if (failures > 0) - ksft_exit_fail(); - else - ksft_exit_pass(); -} --- a/tools/testing/selftests/vm/on-fault-limit.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <sys/mman.h> -#include <stdio.h> -#include <unistd.h> -#include <string.h> -#include <sys/time.h> -#include <sys/resource.h> - -#ifndef MCL_ONFAULT -#define MCL_ONFAULT (MCL_FUTURE << 1) -#endif - -static int test_limit(void) -{ - int ret = 1; - struct rlimit lims; - void *map; - - if (getrlimit(RLIMIT_MEMLOCK, &lims)) { - perror("getrlimit"); - return ret; - } - - if (mlockall(MCL_ONFAULT | MCL_FUTURE)) { - perror("mlockall"); - return ret; - } - - map = mmap(NULL, 2 * lims.rlim_max, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); - if (map != MAP_FAILED) - printf("mmap should have failed, but didn't\n"); - else { - ret = 0; - munmap(map, 2 * lims.rlim_max); - } - - munlockall(); - return ret; -} - -int main(int argc, char **argv) -{ - int ret = 0; - - ret += test_limit(); - return ret; -} --- a/tools/testing/selftests/vm/pkey-helpers.h +++ /dev/null @@ -1,226 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _PKEYS_HELPER_H -#define _PKEYS_HELPER_H -#define _GNU_SOURCE -#include <string.h> -#include <stdarg.h> -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <signal.h> -#include <assert.h> -#include <stdlib.h> -#include <ucontext.h> -#include <sys/mman.h> - -#include "../kselftest.h" - -/* Define some kernel-like types */ -#define u8 __u8 -#define u16 __u16 -#define u32 __u32 -#define u64 __u64 - -#define PTR_ERR_ENOTSUP ((void *)-ENOTSUP) - -#ifndef DEBUG_LEVEL -#define DEBUG_LEVEL 0 -#endif -#define DPRINT_IN_SIGNAL_BUF_SIZE 4096 -extern int dprint_in_signal; -extern char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; - -extern int test_nr; -extern int iteration_nr; - -#ifdef __GNUC__ -__attribute__((format(printf, 1, 2))) -#endif -static inline void sigsafe_printf(const char *format, ...) 
-{ - va_list ap; - - if (!dprint_in_signal) { - va_start(ap, format); - vprintf(format, ap); - va_end(ap); - } else { - int ret; - /* - * No printf() functions are signal-safe. - * They deadlock easily. Write the format - * string to get some output, even if - * incomplete. - */ - ret = write(1, format, strlen(format)); - if (ret < 0) - exit(1); - } -} -#define dprintf_level(level, args...) do { \ - if (level <= DEBUG_LEVEL) \ - sigsafe_printf(args); \ -} while (0) -#define dprintf0(args...) dprintf_level(0, args) -#define dprintf1(args...) dprintf_level(1, args) -#define dprintf2(args...) dprintf_level(2, args) -#define dprintf3(args...) dprintf_level(3, args) -#define dprintf4(args...) dprintf_level(4, args) - -extern void abort_hooks(void); -#define pkey_assert(condition) do { \ - if (!(condition)) { \ - dprintf0("assert() at %s::%d test_nr: %d iteration: %d\n", \ - __FILE__, __LINE__, \ - test_nr, iteration_nr); \ - dprintf0("errno at assert: %d", errno); \ - abort_hooks(); \ - exit(__LINE__); \ - } \ -} while (0) - -__attribute__((noinline)) int read_ptr(int *ptr); -void expected_pkey_fault(int pkey); -int sys_pkey_alloc(unsigned long flags, unsigned long init_val); -int sys_pkey_free(unsigned long pkey); -int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey); -void record_pkey_malloc(void *ptr, long size, int prot); - -#if defined(__i386__) || defined(__x86_64__) /* arch */ -#include "pkey-x86.h" -#elif defined(__powerpc64__) /* arch */ -#include "pkey-powerpc.h" -#else /* arch */ -#error Architecture not supported -#endif /* arch */ - -#define PKEY_MASK (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE) - -static inline u64 set_pkey_bits(u64 reg, int pkey, u64 flags) -{ - u32 shift = pkey_bit_position(pkey); - /* mask out bits from pkey in old value */ - reg &= ~((u64)PKEY_MASK << shift); - /* OR in new bits for pkey */ - reg |= (flags & PKEY_MASK) << shift; - return reg; -} - -static inline u64 get_pkey_bits(u64 reg, int pkey) -{ - u32 shift = pkey_bit_position(pkey); - /* - * shift down the relevant bits to the lowest two, then - * mask off all the other higher bits - */ - return ((reg >> shift) & PKEY_MASK); -} - -extern u64 shadow_pkey_reg; - -static inline u64 _read_pkey_reg(int line) -{ - u64 pkey_reg = __read_pkey_reg(); - - dprintf4("read_pkey_reg(line=%d) pkey_reg: %016llx" - " shadow: %016llx\n", - line, pkey_reg, shadow_pkey_reg); - assert(pkey_reg == shadow_pkey_reg); - - return pkey_reg; -} - -#define read_pkey_reg() _read_pkey_reg(__LINE__) - -static inline void write_pkey_reg(u64 pkey_reg) -{ - dprintf4("%s() changing %016llx to %016llx\n", __func__, - __read_pkey_reg(), pkey_reg); - /* will do the shadow check for us: */ - read_pkey_reg(); - __write_pkey_reg(pkey_reg); - shadow_pkey_reg = pkey_reg; - dprintf4("%s(%016llx) pkey_reg: %016llx\n", __func__, - pkey_reg, __read_pkey_reg()); -} - -/* - * These are technically racy. since something could - * change PKEY register between the read and the write. 
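 *
 * [Editor's note, not part of the original file: beyond the race, the
 * "allow" branches below look inverted -- clearing a disable bit should
 * be pkey_reg &= ~(1 << bit) rather than pkey_reg &= (1 << bit), which
 * would also wipe every other key's bits. Neither helper appears to be
 * called from the tests, which is presumably why this went unnoticed.]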
- */ -static inline void __pkey_access_allow(int pkey, int do_allow) -{ - u64 pkey_reg = read_pkey_reg(); - int bit = pkey * 2; - - if (do_allow) - pkey_reg &= (1<<bit); - else - pkey_reg |= (1<<bit); - - dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); - write_pkey_reg(pkey_reg); -} - -static inline void __pkey_write_allow(int pkey, int do_allow_write) -{ - u64 pkey_reg = read_pkey_reg(); - int bit = pkey * 2 + 1; - - if (do_allow_write) - pkey_reg &= (1<<bit); - else - pkey_reg |= (1<<bit); - - write_pkey_reg(pkey_reg); - dprintf4("pkey_reg now: %016llx\n", read_pkey_reg()); -} - -#define ALIGN_UP(x, align_to) (((x) + ((align_to)-1)) & ~((align_to)-1)) -#define ALIGN_DOWN(x, align_to) ((x) & ~((align_to)-1)) -#define ALIGN_PTR_UP(p, ptr_align_to) \ - ((typeof(p))ALIGN_UP((unsigned long)(p), ptr_align_to)) -#define ALIGN_PTR_DOWN(p, ptr_align_to) \ - ((typeof(p))ALIGN_DOWN((unsigned long)(p), ptr_align_to)) -#define __stringify_1(x...) #x -#define __stringify(x...) __stringify_1(x) - -static inline u32 *siginfo_get_pkey_ptr(siginfo_t *si) -{ -#ifdef si_pkey - return &si->si_pkey; -#else - return (u32 *)(((u8 *)si) + si_pkey_offset); -#endif -} - -static inline int kernel_has_pkeys(void) -{ - /* try allocating a key and see if it succeeds */ - int ret = sys_pkey_alloc(0, 0); - if (ret <= 0) { - return 0; - } - sys_pkey_free(ret); - return 1; -} - -static inline int is_pkeys_supported(void) -{ - /* check if the cpu supports pkeys */ - if (!cpu_has_pkeys()) { - dprintf1("SKIP: %s: no CPU support\n", __func__); - return 0; - } - - /* check if the kernel supports pkeys */ - if (!kernel_has_pkeys()) { - dprintf1("SKIP: %s: no kernel support\n", __func__); - return 0; - } - - return 1; -} - -#endif /* _PKEYS_HELPER_H */ --- a/tools/testing/selftests/vm/pkey-powerpc.h +++ /dev/null @@ -1,133 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _PKEYS_POWERPC_H -#define _PKEYS_POWERPC_H - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 386 -#endif -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 384 -# define SYS_pkey_free 385 -#endif -#define REG_IP_IDX PT_NIP -#define REG_TRAPNO PT_TRAP -#define gregs gp_regs -#define fpregs fp_regs -#define si_pkey_offset 0x20 - -#undef PKEY_DISABLE_ACCESS -#define PKEY_DISABLE_ACCESS 0x3 /* disable read and write */ - -#undef PKEY_DISABLE_WRITE -#define PKEY_DISABLE_WRITE 0x2 - -#define NR_PKEYS 32 -#define NR_RESERVED_PKEYS_4K 27 /* pkey-0, pkey-1, exec-only-pkey - and 24 other keys that cannot be - represented in the PTE */ -#define NR_RESERVED_PKEYS_64K_3KEYS 3 /* PowerNV and KVM: pkey-0, - pkey-1 and exec-only key */ -#define NR_RESERVED_PKEYS_64K_4KEYS 4 /* PowerVM: pkey-0, pkey-1, - pkey-31 and exec-only key */ -#define PKEY_BITS_PER_PKEY 2 -#define HPAGE_SIZE (1UL << 24) -#define PAGE_SIZE sysconf(_SC_PAGESIZE) - -static inline u32 pkey_bit_position(int pkey) -{ - return (NR_PKEYS - pkey - 1) * PKEY_BITS_PER_PKEY; -} - -static inline u64 __read_pkey_reg(void) -{ - u64 pkey_reg; - - asm volatile("mfspr %0, 0xd" : "=r" (pkey_reg)); - - return pkey_reg; -} - -static inline void __write_pkey_reg(u64 pkey_reg) -{ - u64 amr = pkey_reg; - - dprintf4("%s() changing %016llx to %016llx\n", - __func__, __read_pkey_reg(), pkey_reg); - - asm volatile("isync; mtspr 0xd, %0; isync" - : : "r" ((unsigned long)(amr)) : "memory"); - - dprintf4("%s() pkey register after changing %016llx to %016llx\n", - __func__, __read_pkey_reg(), pkey_reg); -} - -static inline int cpu_has_pkeys(void) -{ - /* No simple way to determine this */ - return 1; -} - -static 
inline bool arch_is_powervm() -{ - struct stat buf; - - if ((stat("/sys/firmware/devicetree/base/ibm,partition-name", &buf) == 0) && - (stat("/sys/firmware/devicetree/base/hmc-managed?", &buf) == 0) && - (stat("/sys/firmware/devicetree/base/chosen/qemu,graphic-width", &buf) == -1) ) - return true; - - return false; -} - -static inline int get_arch_reserved_keys(void) -{ - if (sysconf(_SC_PAGESIZE) == 4096) - return NR_RESERVED_PKEYS_4K; - else - if (arch_is_powervm()) - return NR_RESERVED_PKEYS_64K_4KEYS; - else - return NR_RESERVED_PKEYS_64K_3KEYS; -} - -void expect_fault_on_read_execonly_key(void *p1, int pkey) -{ - /* - * powerpc does not allow userspace to change permissions of exec-only - * keys since those keys are not allocated by userspace. The signal - * handler wont be able to reset the permissions, which means the code - * will infinitely continue to segfault here. - */ - return; -} - -/* 4-byte instructions * 16384 = 64K page */ -#define __page_o_noops() asm(".rept 16384 ; nop; .endr") - -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) -{ - void *ptr; - int ret; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - - ret = syscall(__NR_subpage_prot, ptr, size, NULL); - if (ret) { - perror("subpage_perm"); - return PTR_ERR_ENOTSUP; - } - - ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); - pkey_assert(!ret); - record_pkey_malloc(ptr, size, prot); - - dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); - return ptr; -} - -#endif /* _PKEYS_POWERPC_H */ --- a/tools/testing/selftests/vm/pkey-x86.h +++ /dev/null @@ -1,177 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef _PKEYS_X86_H -#define _PKEYS_X86_H - -#ifdef __i386__ - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 380 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 381 -# define SYS_pkey_free 382 -#endif - -#define REG_IP_IDX REG_EIP -#define si_pkey_offset 0x14 - -#else - -#ifndef SYS_mprotect_key -# define SYS_mprotect_key 329 -#endif - -#ifndef SYS_pkey_alloc -# define SYS_pkey_alloc 330 -# define SYS_pkey_free 331 -#endif - -#define REG_IP_IDX REG_RIP -#define si_pkey_offset 0x20 - -#endif - -#ifndef PKEY_DISABLE_ACCESS -# define PKEY_DISABLE_ACCESS 0x1 -#endif - -#ifndef PKEY_DISABLE_WRITE -# define PKEY_DISABLE_WRITE 0x2 -#endif - -#define NR_PKEYS 16 -#define NR_RESERVED_PKEYS 2 /* pkey-0 and exec-only-pkey */ -#define PKEY_BITS_PER_PKEY 2 -#define HPAGE_SIZE (1UL<<21) -#define PAGE_SIZE 4096 -#define MB (1<<20) - -static inline void __page_o_noops(void) -{ - /* 8-bytes of instruction * 512 bytes = 1 page */ - asm(".rept 512 ; nopl 0x7eeeeeee(%eax) ; .endr"); -} - -static inline u64 __read_pkey_reg(void) -{ - unsigned int eax, edx; - unsigned int ecx = 0; - unsigned pkey_reg; - - asm volatile(".byte 0x0f,0x01,0xee\n\t" - : "=a" (eax), "=d" (edx) - : "c" (ecx)); - pkey_reg = eax; - return pkey_reg; -} - -static inline void __write_pkey_reg(u64 pkey_reg) -{ - unsigned int eax = pkey_reg; - unsigned int ecx = 0; - unsigned int edx = 0; - - dprintf4("%s() changing %016llx to %016llx\n", __func__, - __read_pkey_reg(), pkey_reg); - asm volatile(".byte 0x0f,0x01,0xef\n\t" - : : "a" (eax), "c" (ecx), "d" (edx)); - assert(pkey_reg == __read_pkey_reg()); -} - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx) */ -#define X86_FEATURE_PKU (1<<3) /* Protection Keys for Userspace */ -#define 
X86_FEATURE_OSPKE (1<<4) /* OS Protection Keys Enable */ - -static inline int cpu_has_pkeys(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - __cpuid_count(0x7, 0x0, eax, ebx, ecx, edx); - - if (!(ecx & X86_FEATURE_PKU)) { - dprintf2("cpu does not have PKU\n"); - return 0; - } - if (!(ecx & X86_FEATURE_OSPKE)) { - dprintf2("cpu does not have OSPKE\n"); - return 0; - } - return 1; -} - -static inline int cpu_max_xsave_size(void) -{ - unsigned long XSTATE_CPUID = 0xd; - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - - __cpuid_count(XSTATE_CPUID, 0, eax, ebx, ecx, edx); - return ecx; -} - -static inline u32 pkey_bit_position(int pkey) -{ - return pkey * PKEY_BITS_PER_PKEY; -} - -#define XSTATE_PKEY_BIT (9) -#define XSTATE_PKEY 0x200 -#define XSTATE_BV_OFFSET 512 - -int pkey_reg_xstate_offset(void) -{ - unsigned int eax; - unsigned int ebx; - unsigned int ecx; - unsigned int edx; - int xstate_offset; - int xstate_size; - unsigned long XSTATE_CPUID = 0xd; - int leaf; - - /* assume that XSTATE_PKEY is set in XCR0 */ - leaf = XSTATE_PKEY_BIT; - { - __cpuid_count(XSTATE_CPUID, leaf, eax, ebx, ecx, edx); - - if (leaf == XSTATE_PKEY_BIT) { - xstate_offset = ebx; - xstate_size = eax; - } - } - - if (xstate_size == 0) { - printf("could not find size/offset of PKEY in xsave state\n"); - return 0; - } - - return xstate_offset; -} - -static inline int get_arch_reserved_keys(void) -{ - return NR_RESERVED_PKEYS; -} - -void expect_fault_on_read_execonly_key(void *p1, int pkey) -{ - int ptr_contents; - - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - expected_pkey_fault(pkey); -} - -void *malloc_pkey_with_mprotect_subpage(long size, int prot, u16 pkey) -{ - return PTR_ERR_ENOTSUP; -} - -#endif /* _PKEYS_X86_H */ --- a/tools/testing/selftests/vm/protection_keys.c +++ /dev/null @@ -1,1788 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Tests Memory Protection Keys (see Documentation/core-api/protection-keys.rst) - * - * There are examples in here of: - * * how to set protection keys on memory - * * how to set/clear bits in pkey registers (the rights register) - * * how to handle SEGV_PKUERR signals and extract pkey-relevant - * information from the siginfo - * - * Things to add: - * make sure KSM and KSM COW breaking works - * prefault pages in at malloc, or not - * protect MPX bounds tables with protection keys? 
- * make sure VMA splitting/merging is working correctly - * OOMs can destroy mm->mmap (see exit_mmap()), so make sure it is immune to pkeys - * look for pkey "leaks" where it is still set on a VMA but "freed" back to the kernel - * do a plain mprotect() to a mprotect_pkey() area and make sure the pkey sticks - * - * Compile like this: - * gcc -mxsave -o protection_keys -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - * gcc -mxsave -m32 -o protection_keys_32 -O2 -g -std=gnu99 -pthread -Wall protection_keys.c -lrt -ldl -lm - */ -#define _GNU_SOURCE -#define __SANE_USERSPACE_TYPES__ -#include <errno.h> -#include <linux/elf.h> -#include <linux/futex.h> -#include <time.h> -#include <sys/time.h> -#include <sys/syscall.h> -#include <string.h> -#include <stdio.h> -#include <stdint.h> -#include <stdbool.h> -#include <signal.h> -#include <assert.h> -#include <stdlib.h> -#include <ucontext.h> -#include <sys/mman.h> -#include <sys/types.h> -#include <sys/wait.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <unistd.h> -#include <sys/ptrace.h> -#include <setjmp.h> - -#include "pkey-helpers.h" - -int iteration_nr = 1; -int test_nr; - -u64 shadow_pkey_reg; -int dprint_in_signal; -char dprint_in_signal_buffer[DPRINT_IN_SIGNAL_BUF_SIZE]; - -void cat_into_file(char *str, char *file) -{ - int fd = open(file, O_RDWR); - int ret; - - dprintf2("%s(): writing '%s' to '%s'\n", __func__, str, file); - /* - * these need to be raw because they are called under - * pkey_assert() - */ - if (fd < 0) { - fprintf(stderr, "error opening '%s'\n", str); - perror("error: "); - exit(__LINE__); - } - - ret = write(fd, str, strlen(str)); - if (ret != strlen(str)) { - perror("write to file failed"); - fprintf(stderr, "filename: '%s' str: '%s'\n", file, str); - exit(__LINE__); - } - close(fd); -} - -#if CONTROL_TRACING > 0 -static int warned_tracing; -int tracing_root_ok(void) -{ - if (geteuid() != 0) { - if (!warned_tracing) - fprintf(stderr, "WARNING: not run as root, " - "can not do tracing control\n"); - warned_tracing = 1; - return 0; - } - return 1; -} -#endif - -void tracing_on(void) -{ -#if CONTROL_TRACING > 0 -#define TRACEDIR "/sys/kernel/debug/tracing" - char pidstr[32]; - - if (!tracing_root_ok()) - return; - - sprintf(pidstr, "%d", getpid()); - cat_into_file("0", TRACEDIR "/tracing_on"); - cat_into_file("\n", TRACEDIR "/trace"); - if (1) { - cat_into_file("function_graph", TRACEDIR "/current_tracer"); - cat_into_file("1", TRACEDIR "/options/funcgraph-proc"); - } else { - cat_into_file("nop", TRACEDIR "/current_tracer"); - } - cat_into_file(pidstr, TRACEDIR "/set_ftrace_pid"); - cat_into_file("1", TRACEDIR "/tracing_on"); - dprintf1("enabled tracing\n"); -#endif -} - -void tracing_off(void) -{ -#if CONTROL_TRACING > 0 - if (!tracing_root_ok()) - return; - cat_into_file("0", "/sys/kernel/debug/tracing/tracing_on"); -#endif -} - -void abort_hooks(void) -{ - fprintf(stderr, "running %s()...\n", __func__); - tracing_off(); -#ifdef SLEEP_ON_ABORT - sleep(SLEEP_ON_ABORT); -#endif -} - -/* - * This attempts to have roughly a page of instructions followed by a few - * instructions that do a write, and another page of instructions. That - * way, we are pretty sure that the write is in the second page of - * instructions and has at least a page of padding behind it. - * - * *That* lets us be sure to madvise() away the write instruction, which - * will then fault, which makes sure that the fault code handles - * execute-only memory properly. 
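 *
 * For illustration, this is roughly how get_pointer_to_instructions()
 * further down consumes the layout: align to the start of the function,
 * step one page in (to the page holding the write), then madvise() that
 * page away so the next touch takes an instruction fault:
 *
 *	void *p = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE);
 *	p += PAGE_SIZE;
 *	madvise(p, PAGE_SIZE, MADV_DONTNEED);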
- */ -#ifdef __powerpc64__ -/* This way, both 4K and 64K alignment are maintained */ -__attribute__((__aligned__(65536))) -#else -__attribute__((__aligned__(PAGE_SIZE))) -#endif -void lots_o_noops_around_write(int *write_to_me) -{ - dprintf3("running %s()\n", __func__); - __page_o_noops(); - /* Assume this happens in the second page of instructions: */ - *write_to_me = __LINE__; - /* pad out by another page: */ - __page_o_noops(); - dprintf3("%s() done\n", __func__); -} - -void dump_mem(void *dumpme, int len_bytes) -{ - char *c = (void *)dumpme; - int i; - - for (i = 0; i < len_bytes; i += sizeof(u64)) { - u64 *ptr = (u64 *)(c + i); - dprintf1("dump[%03d][@%p]: %016llx\n", i, ptr, *ptr); - } -} - -static u32 hw_pkey_get(int pkey, unsigned long flags) -{ - u64 pkey_reg = __read_pkey_reg(); - - dprintf1("%s(pkey=%d, flags=%lx) = %x / %d\n", - __func__, pkey, flags, 0, 0); - dprintf2("%s() raw pkey_reg: %016llx\n", __func__, pkey_reg); - - return (u32) get_pkey_bits(pkey_reg, pkey); -} - -static int hw_pkey_set(int pkey, unsigned long rights, unsigned long flags) -{ - u32 mask = (PKEY_DISABLE_ACCESS|PKEY_DISABLE_WRITE); - u64 old_pkey_reg = __read_pkey_reg(); - u64 new_pkey_reg; - - /* make sure that 'rights' only contains the bits we expect: */ - assert(!(rights & ~mask)); - - /* modify bits accordingly in old pkey_reg and assign it */ - new_pkey_reg = set_pkey_bits(old_pkey_reg, pkey, rights); - - __write_pkey_reg(new_pkey_reg); - - dprintf3("%s(pkey=%d, rights=%lx, flags=%lx) = %x" - " pkey_reg now: %016llx old_pkey_reg: %016llx\n", - __func__, pkey, rights, flags, 0, __read_pkey_reg(), - old_pkey_reg); - return 0; -} - -void pkey_disable_set(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights; - u64 orig_pkey_reg = read_pkey_reg(); - - dprintf1("START->%s(%d, 0x%x)\n", __func__, - pkey, flags); - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - pkey_assert(pkey_rights >= 0); - - pkey_rights |= flags; - - ret = hw_pkey_set(pkey, pkey_rights, syscall_flags); - assert(!ret); - /* pkey_reg and flags have the same format */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); - dprintf1("%s(%d) shadow: 0x%016llx\n", - __func__, pkey, shadow_pkey_reg); - - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkey_reg: 0x%016llx\n", - __func__, pkey, read_pkey_reg()); - if (flags) - pkey_assert(read_pkey_reg() >= orig_pkey_reg); - dprintf1("END<---%s(%d, 0x%x)\n", __func__, - pkey, flags); -} - -void pkey_disable_clear(int pkey, int flags) -{ - unsigned long syscall_flags = 0; - int ret; - int pkey_rights = hw_pkey_get(pkey, syscall_flags); - u64 orig_pkey_reg = read_pkey_reg(); - - pkey_assert(flags & (PKEY_DISABLE_ACCESS | PKEY_DISABLE_WRITE)); - - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - pkey_assert(pkey_rights >= 0); - - pkey_rights &= ~flags; - - ret = hw_pkey_set(pkey, pkey_rights, 0); - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, pkey, pkey_rights); - pkey_assert(ret >= 0); - - pkey_rights = hw_pkey_get(pkey, syscall_flags); - dprintf1("%s(%d) hw_pkey_get(%d): %x\n", __func__, - pkey, pkey, pkey_rights); - - dprintf1("%s(%d) pkey_reg: 0x%016llx\n", __func__, - pkey, read_pkey_reg()); - if (flags) - 
assert(read_pkey_reg() <= orig_pkey_reg); -} - -void pkey_write_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_WRITE); -} -void pkey_write_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_WRITE); -} -void pkey_access_allow(int pkey) -{ - pkey_disable_clear(pkey, PKEY_DISABLE_ACCESS); -} -void pkey_access_deny(int pkey) -{ - pkey_disable_set(pkey, PKEY_DISABLE_ACCESS); -} - -/* Failed address bound checks: */ -#ifndef SEGV_BNDERR -# define SEGV_BNDERR 3 -#endif - -#ifndef SEGV_PKUERR -# define SEGV_PKUERR 4 -#endif - -static char *si_code_str(int si_code) -{ - if (si_code == SEGV_MAPERR) - return "SEGV_MAPERR"; - if (si_code == SEGV_ACCERR) - return "SEGV_ACCERR"; - if (si_code == SEGV_BNDERR) - return "SEGV_BNDERR"; - if (si_code == SEGV_PKUERR) - return "SEGV_PKUERR"; - return "UNKNOWN"; -} - -int pkey_faults; -int last_si_pkey = -1; -void signal_handler(int signum, siginfo_t *si, void *vucontext) -{ - ucontext_t *uctxt = vucontext; - int trapno; - unsigned long ip; - char *fpregs; -#if defined(__i386__) || defined(__x86_64__) /* arch */ - u32 *pkey_reg_ptr; - int pkey_reg_offset; -#endif /* arch */ - u64 siginfo_pkey; - u32 *si_pkey_ptr; - - dprint_in_signal = 1; - dprintf1(">>>>===============SIGSEGV============================\n"); - dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", - __func__, __LINE__, - __read_pkey_reg(), shadow_pkey_reg); - - trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; - ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; - fpregs = (char *) uctxt->uc_mcontext.fpregs; - - dprintf2("%s() trapno: %d ip: 0x%016lx info->si_code: %s/%d\n", - __func__, trapno, ip, si_code_str(si->si_code), - si->si_code); - -#if defined(__i386__) || defined(__x86_64__) /* arch */ -#ifdef __i386__ - /* - * 32-bit has some extra padding so that userspace can tell whether - * the XSTATE header is present in addition to the "legacy" FPU - * state. We just assume that it is here. - */ - fpregs += 0x70; -#endif /* i386 */ - pkey_reg_offset = pkey_reg_xstate_offset(); - pkey_reg_ptr = (void *)(&fpregs[pkey_reg_offset]); - - /* - * If we got a PKEY fault, we *HAVE* to have at least one bit set in - * here. 
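 *
 * For illustration, a single denied key is already enough to make the
 * register non-zero. Using the helpers from the pkey-*.h headers above,
 * a hypothetical pkey 5 with access disabled contributes:
 *
 *	u64 reg = (u64)PKEY_DISABLE_ACCESS << pkey_bit_position(5);
 *
 * which is non-zero, hence the pkey_assert(*pkey_reg_ptr) below.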
- */ - dprintf1("pkey_reg_xstate_offset: %d\n", pkey_reg_xstate_offset()); - if (DEBUG_LEVEL > 4) - dump_mem(pkey_reg_ptr - 128, 256); - pkey_assert(*pkey_reg_ptr); -#endif /* arch */ - - dprintf1("siginfo: %p\n", si); - dprintf1(" fpregs: %p\n", fpregs); - - if ((si->si_code == SEGV_MAPERR) || - (si->si_code == SEGV_ACCERR) || - (si->si_code == SEGV_BNDERR)) { - printf("non-PK si_code, exiting...\n"); - exit(4); - } - - si_pkey_ptr = siginfo_get_pkey_ptr(si); - dprintf1("si_pkey_ptr: %p\n", si_pkey_ptr); - dump_mem((u8 *)si_pkey_ptr - 8, 24); - siginfo_pkey = *si_pkey_ptr; - pkey_assert(siginfo_pkey < NR_PKEYS); - last_si_pkey = siginfo_pkey; - - /* - * need __read_pkey_reg() version so we do not do shadow_pkey_reg - * checking - */ - dprintf1("signal pkey_reg from pkey_reg: %016llx\n", - __read_pkey_reg()); - dprintf1("pkey from siginfo: %016llx\n", siginfo_pkey); -#if defined(__i386__) || defined(__x86_64__) /* arch */ - dprintf1("signal pkey_reg from xsave: %08x\n", *pkey_reg_ptr); - *(u64 *)pkey_reg_ptr = 0x00000000; - dprintf1("WARNING: set PKEY_REG=0 to allow faulting instruction to continue\n"); -#elif defined(__powerpc64__) /* arch */ - /* restore access and let the faulting instruction continue */ - pkey_access_allow(siginfo_pkey); -#endif /* arch */ - pkey_faults++; - dprintf1("<<<<==================================================\n"); - dprint_in_signal = 0; -} - -int wait_all_children(void) -{ - int status; - return waitpid(-1, &status, 0); -} - -void sig_chld(int x) -{ - dprint_in_signal = 1; - dprintf2("[%d] SIGCHLD: %d\n", getpid(), x); - dprint_in_signal = 0; -} - -void setup_sigsegv_handler(void) -{ - int r, rs; - struct sigaction newact; - struct sigaction oldact; - - /* #PF is mapped to sigsegv */ - int signum = SIGSEGV; - - newact.sa_handler = 0; - newact.sa_sigaction = signal_handler; - - /*sigset_t - signals to block while in the handler */ - /* get the old signal mask. 
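 *
 * A 0/NULL "new set" turns sigprocmask() into a pure query; a minimal
 * sketch of the same idiom:
 *
 *	sigset_t cur;
 *	sigprocmask(SIG_SETMASK, NULL, &cur);
 *
 * leaves the mask unchanged and copies the current one into cur.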
*/ - rs = sigprocmask(SIG_SETMASK, 0, &newact.sa_mask); - pkey_assert(rs == 0); - - /* call sa_sigaction, not sa_handler*/ - newact.sa_flags = SA_SIGINFO; - - newact.sa_restorer = 0; /* void(*)(), obsolete */ - r = sigaction(signum, &newact, &oldact); - r = sigaction(SIGALRM, &newact, &oldact); - pkey_assert(r == 0); -} - -void setup_handlers(void) -{ - signal(SIGCHLD, &sig_chld); - setup_sigsegv_handler(); -} - -pid_t fork_lazy_child(void) -{ - pid_t forkret; - - forkret = fork(); - pkey_assert(forkret >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); - - if (!forkret) { - /* in the child */ - while (1) { - dprintf1("child sleeping...\n"); - sleep(30); - } - } - return forkret; -} - -int sys_mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int sret; - - dprintf2("%s(0x%p, %zx, prot=%lx, pkey=%lx)\n", __func__, - ptr, size, orig_prot, pkey); - - errno = 0; - sret = syscall(SYS_mprotect_key, ptr, size, orig_prot, pkey); - if (errno) { - dprintf2("SYS_mprotect_key sret: %d\n", sret); - dprintf2("SYS_mprotect_key prot: 0x%lx\n", orig_prot); - dprintf2("SYS_mprotect_key failed, errno: %d\n", errno); - if (DEBUG_LEVEL >= 2) - perror("SYS_mprotect_pkey"); - } - return sret; -} - -int sys_pkey_alloc(unsigned long flags, unsigned long init_val) -{ - int ret = syscall(SYS_pkey_alloc, flags, init_val); - dprintf1("%s(flags=%lx, init_val=%lx) syscall ret: %d errno: %d\n", - __func__, flags, init_val, ret, errno); - return ret; -} - -int alloc_pkey(void) -{ - int ret; - unsigned long init_val = 0x0; - - dprintf1("%s()::%d, pkey_reg: 0x%016llx shadow: %016llx\n", - __func__, __LINE__, __read_pkey_reg(), shadow_pkey_reg); - ret = sys_pkey_alloc(0, init_val); - /* - * pkey_alloc() sets PKEY register, so we need to reflect it in - * shadow_pkey_reg: - */ - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - if (ret > 0) { - /* clear both the bits: */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, - ~PKEY_MASK); - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, - __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - /* - * move the new state in from init_val - * (remember, we cheated and init_val == pkey_reg format) - */ - shadow_pkey_reg = set_pkey_bits(shadow_pkey_reg, ret, - init_val); - } - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - dprintf1("%s()::%d errno: %d\n", __func__, __LINE__, errno); - /* for shadow checking: */ - read_pkey_reg(); - dprintf4("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - return ret; -} - -int sys_pkey_free(unsigned long pkey) -{ - int ret = syscall(SYS_pkey_free, pkey); - dprintf1("%s(pkey=%ld) syscall ret: %d\n", __func__, pkey, ret); - return ret; -} - -/* - * I had a bug where pkey bits could be set by mprotect() but - * not cleared. This ensures we get lots of random bit sets - * and clears on the vma and pte pkey bits. 
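 *
 * Typical use, as in run_tests_once() below: allocate a random key,
 * attach it to a fresh allocation, then free both once the test is
 * done:
 *
 *	int pkey = alloc_random_pkey();
 *	int *ptr = malloc_pkey(PAGE_SIZE, PROT_READ|PROT_WRITE, pkey);
 *	...
 *	free_pkey_malloc(ptr);
 *	sys_pkey_free(pkey);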
- */ -int alloc_random_pkey(void) -{ - int max_nr_pkey_allocs; - int ret; - int i; - int alloced_pkeys[NR_PKEYS]; - int nr_alloced = 0; - int random_index; - memset(alloced_pkeys, 0, sizeof(alloced_pkeys)); - - /* allocate every possible key and make a note of which ones we got */ - max_nr_pkey_allocs = NR_PKEYS; - for (i = 0; i < max_nr_pkey_allocs; i++) { - int new_pkey = alloc_pkey(); - if (new_pkey < 0) - break; - alloced_pkeys[nr_alloced++] = new_pkey; - } - - pkey_assert(nr_alloced > 0); - /* select a random one out of the allocated ones */ - random_index = rand() % nr_alloced; - ret = alloced_pkeys[random_index]; - /* now zero it out so we don't free it next */ - alloced_pkeys[random_index] = 0; - - /* go through the allocated ones that we did not want and free them */ - for (i = 0; i < nr_alloced; i++) { - int free_ret; - if (!alloced_pkeys[i]) - continue; - free_ret = sys_pkey_free(alloced_pkeys[i]); - pkey_assert(!free_ret); - } - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", __func__, - __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); - return ret; -} - -int mprotect_pkey(void *ptr, size_t size, unsigned long orig_prot, - unsigned long pkey) -{ - int nr_iterations = random() % 100; - int ret; - - while (0) { - int rpkey = alloc_random_pkey(); - ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); - dprintf1("sys_mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", - ptr, size, orig_prot, pkey, ret); - if (nr_iterations-- < 0) - break; - - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - sys_pkey_free(rpkey); - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, ret, __read_pkey_reg(), - shadow_pkey_reg); - } - pkey_assert(pkey < NR_PKEYS); - - ret = sys_mprotect_pkey(ptr, size, orig_prot, pkey); - dprintf1("mprotect_pkey(%p, %zx, prot=0x%lx, pkey=%ld) ret: %d\n", - ptr, size, orig_prot, pkey, ret); - pkey_assert(!ret); - dprintf1("%s()::%d, ret: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", __func__, - __LINE__, ret, __read_pkey_reg(), shadow_pkey_reg); - return ret; -} - -struct pkey_malloc_record { - void *ptr; - long size; - int prot; -}; -struct pkey_malloc_record *pkey_malloc_records; -struct pkey_malloc_record *pkey_last_malloc_record; -long nr_pkey_malloc_records; -void record_pkey_malloc(void *ptr, long size, int prot) -{ - long i; - struct pkey_malloc_record *rec = NULL; - - for (i = 0; i < nr_pkey_malloc_records; i++) { - rec = &pkey_malloc_records[i]; - /* find a free record */ - if (rec) - break; - } - if (!rec) { - /* every record is full */ - size_t old_nr_records = nr_pkey_malloc_records; - size_t new_nr_records = (nr_pkey_malloc_records * 2 + 1); - size_t new_size = new_nr_records * sizeof(struct pkey_malloc_record); - dprintf2("new_nr_records: %zd\n", new_nr_records); - dprintf2("new_size: %zd\n", new_size); - pkey_malloc_records = realloc(pkey_malloc_records, new_size); - pkey_assert(pkey_malloc_records != NULL); - rec = &pkey_malloc_records[nr_pkey_malloc_records]; - /* - * realloc() does not initialize memory, so zero it from - * the first new record all the way to the end. 
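 *
 * The general shape of the idiom (generic names, not from this file),
 * for a table grown from old_n to new_n entries:
 *
 *	p = realloc(p, new_n * sizeof(*p));
 *	memset(p + old_n, 0, (new_n - old_n) * sizeof(*p));
 *
 * The loop below does the same thing, one record at a time.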
- */ - for (i = 0; i < new_nr_records - old_nr_records; i++) - memset(rec + i, 0, sizeof(*rec)); - } - dprintf3("filling malloc record[%d/%p]: {%p, %ld}\n", - (int)(rec - pkey_malloc_records), rec, ptr, size); - rec->ptr = ptr; - rec->size = size; - rec->prot = prot; - pkey_last_malloc_record = rec; - nr_pkey_malloc_records++; -} - -void free_pkey_malloc(void *ptr) -{ - long i; - int ret; - dprintf3("%s(%p)\n", __func__, ptr); - for (i = 0; i < nr_pkey_malloc_records; i++) { - struct pkey_malloc_record *rec = &pkey_malloc_records[i]; - dprintf4("looking for ptr %p at record[%ld/%p]: {%p, %ld}\n", - ptr, i, rec, rec->ptr, rec->size); - if ((ptr < rec->ptr) || - (ptr >= rec->ptr + rec->size)) - continue; - - dprintf3("found ptr %p at record[%ld/%p]: {%p, %ld}\n", - ptr, i, rec, rec->ptr, rec->size); - nr_pkey_malloc_records--; - ret = munmap(rec->ptr, rec->size); - dprintf3("munmap ret: %d\n", ret); - pkey_assert(!ret); - dprintf3("clearing rec->ptr, rec: %p\n", rec); - rec->ptr = NULL; - dprintf3("done clearing rec->ptr, rec: %p\n", rec); - return; - } - pkey_assert(false); -} - - -void *malloc_pkey_with_mprotect(long size, int prot, u16 pkey) -{ - void *ptr; - int ret; - - read_pkey_reg(); - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - ret = mprotect_pkey((void *)ptr, PAGE_SIZE, prot, pkey); - pkey_assert(!ret); - record_pkey_malloc(ptr, size, prot); - read_pkey_reg(); - - dprintf1("%s() for pkey %d @ %p\n", __func__, pkey, ptr); - return ptr; -} - -void *malloc_pkey_anon_huge(long size, int prot, u16 pkey) -{ - int ret; - void *ptr; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - /* - * Guarantee we can fit at least one huge page in the resulting - * allocation by allocating space for 2: - */ - size = ALIGN_UP(size, HPAGE_SIZE * 2); - ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - pkey_assert(ptr != (void *)-1); - record_pkey_malloc(ptr, size, prot); - mprotect_pkey(ptr, size, prot, pkey); - - dprintf1("unaligned ptr: %p\n", ptr); - ptr = ALIGN_PTR_UP(ptr, HPAGE_SIZE); - dprintf1(" aligned ptr: %p\n", ptr); - ret = madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE); - dprintf1("MADV_HUGEPAGE ret: %d\n", ret); - ret = madvise(ptr, HPAGE_SIZE, MADV_WILLNEED); - dprintf1("MADV_WILLNEED ret: %d\n", ret); - memset(ptr, 0, HPAGE_SIZE); - - dprintf1("mmap()'d thp for pkey %d @ %p\n", pkey, ptr); - return ptr; -} - -int hugetlb_setup_ok; -#define SYSFS_FMT_NR_HUGE_PAGES "/sys/kernel/mm/hugepages/hugepages-%ldkB/nr_hugepages" -#define GET_NR_HUGE_PAGES 10 -void setup_hugetlbfs(void) -{ - int err; - int fd; - char buf[256]; - long hpagesz_kb; - long hpagesz_mb; - - if (geteuid() != 0) { - fprintf(stderr, "WARNING: not run as root, can not do hugetlb test\n"); - return; - } - - cat_into_file(__stringify(GET_NR_HUGE_PAGES), "/proc/sys/vm/nr_hugepages"); - - /* - * Now go make sure that we got the pages and that they - * are PMD-level pages. Someone might have made PUD-level - * pages the default. 
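 *
 * Concretely, with HPAGE_SIZE from the pkey-*.h headers, the sysfs file
 * consulted below is, e.g. on x86 (HPAGE_SIZE = 1UL << 21, so
 * hpagesz_kb = 2048):
 *
 *	/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages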
- */ - hpagesz_kb = HPAGE_SIZE / 1024; - hpagesz_mb = hpagesz_kb / 1024; - sprintf(buf, SYSFS_FMT_NR_HUGE_PAGES, hpagesz_kb); - fd = open(buf, O_RDONLY); - if (fd < 0) { - fprintf(stderr, "opening sysfs %ldM hugetlb config: %s\n", - hpagesz_mb, strerror(errno)); - return; - } - - /* -1 to guarantee leaving the trailing \0 */ - err = read(fd, buf, sizeof(buf)-1); - close(fd); - if (err <= 0) { - fprintf(stderr, "reading sysfs %ldM hugetlb config: %s\n", - hpagesz_mb, strerror(errno)); - return; - } - - if (atoi(buf) != GET_NR_HUGE_PAGES) { - fprintf(stderr, "could not confirm %ldM pages, got: '%s' expected %d\n", - hpagesz_mb, buf, GET_NR_HUGE_PAGES); - return; - } - - hugetlb_setup_ok = 1; -} - -void *malloc_pkey_hugetlb(long size, int prot, u16 pkey) -{ - void *ptr; - int flags = MAP_ANONYMOUS|MAP_PRIVATE|MAP_HUGETLB; - - if (!hugetlb_setup_ok) - return PTR_ERR_ENOTSUP; - - dprintf1("doing %s(%ld, %x, %x)\n", __func__, size, prot, pkey); - size = ALIGN_UP(size, HPAGE_SIZE * 2); - pkey_assert(pkey < NR_PKEYS); - ptr = mmap(NULL, size, PROT_NONE, flags, -1, 0); - pkey_assert(ptr != (void *)-1); - mprotect_pkey(ptr, size, prot, pkey); - - record_pkey_malloc(ptr, size, prot); - - dprintf1("mmap()'d hugetlbfs for pkey %d @ %p\n", pkey, ptr); - return ptr; -} - -void *malloc_pkey_mmap_dax(long size, int prot, u16 pkey) -{ - void *ptr; - int fd; - - dprintf1("doing %s(size=%ld, prot=0x%x, pkey=%d)\n", __func__, - size, prot, pkey); - pkey_assert(pkey < NR_PKEYS); - fd = open("/dax/foo", O_RDWR); - pkey_assert(fd >= 0); - - ptr = mmap(0, size, prot, MAP_SHARED, fd, 0); - pkey_assert(ptr != (void *)-1); - - mprotect_pkey(ptr, size, prot, pkey); - - record_pkey_malloc(ptr, size, prot); - - dprintf1("mmap()'d for pkey %d @ %p\n", pkey, ptr); - close(fd); - return ptr; -} - -void *(*pkey_malloc[])(long size, int prot, u16 pkey) = { - - malloc_pkey_with_mprotect, - malloc_pkey_with_mprotect_subpage, - malloc_pkey_anon_huge, - malloc_pkey_hugetlb -/* can not do direct with the pkey_mprotect() API: - malloc_pkey_mmap_direct, - malloc_pkey_mmap_dax, -*/ -}; - -void *malloc_pkey(long size, int prot, u16 pkey) -{ - void *ret; - static int malloc_type; - int nr_malloc_types = ARRAY_SIZE(pkey_malloc); - - pkey_assert(pkey < NR_PKEYS); - - while (1) { - pkey_assert(malloc_type < nr_malloc_types); - - ret = pkey_malloc[malloc_type](size, prot, pkey); - pkey_assert(ret != (void *)-1); - - malloc_type++; - if (malloc_type >= nr_malloc_types) - malloc_type = (random()%nr_malloc_types); - - /* try again if the malloc_type we tried is unsupported */ - if (ret == PTR_ERR_ENOTSUP) - continue; - - break; - } - - dprintf3("%s(%ld, prot=%x, pkey=%x) returning: %p\n", __func__, - size, prot, pkey, ret); - return ret; -} - -int last_pkey_faults; -#define UNKNOWN_PKEY -2 -void expected_pkey_fault(int pkey) -{ - dprintf2("%s(): last_pkey_faults: %d pkey_faults: %d\n", - __func__, last_pkey_faults, pkey_faults); - dprintf2("%s(%d): last_si_pkey: %d\n", __func__, pkey, last_si_pkey); - pkey_assert(last_pkey_faults + 1 == pkey_faults); - - /* - * For exec-only memory, we do not know the pkey in - * advance, so skip this check. - */ - if (pkey != UNKNOWN_PKEY) - pkey_assert(last_si_pkey == pkey); - -#if defined(__i386__) || defined(__x86_64__) /* arch */ - /* - * The signal handler shold have cleared out PKEY register to let the - * test program continue. We now have to restore it. 
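 *
 * The expected calling pattern, as in the tests further down: deny
 * access, touch the memory so the handler fires, then let this helper
 * verify the fault count and restore the register:
 *
 *	pkey_access_deny(pkey);
 *	ptr_contents = read_ptr(ptr);
 *	expected_pkey_fault(pkey);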
- */ - if (__read_pkey_reg() != 0) -#else /* arch */ - if (__read_pkey_reg() != shadow_pkey_reg) -#endif /* arch */ - pkey_assert(0); - - __write_pkey_reg(shadow_pkey_reg); - dprintf1("%s() set pkey_reg=%016llx to restore state after signal " - "nuked it\n", __func__, shadow_pkey_reg); - last_pkey_faults = pkey_faults; - last_si_pkey = -1; -} - -#define do_not_expect_pkey_fault(msg) do { \ - if (last_pkey_faults != pkey_faults) \ - dprintf0("unexpected PKey fault: %s\n", msg); \ - pkey_assert(last_pkey_faults == pkey_faults); \ -} while (0) - -int test_fds[10] = { -1 }; -int nr_test_fds; -void __save_test_fd(int fd) -{ - pkey_assert(fd >= 0); - pkey_assert(nr_test_fds < ARRAY_SIZE(test_fds)); - test_fds[nr_test_fds] = fd; - nr_test_fds++; -} - -int get_test_read_fd(void) -{ - int test_fd = open("/etc/passwd", O_RDONLY); - __save_test_fd(test_fd); - return test_fd; -} - -void close_test_fds(void) -{ - int i; - - for (i = 0; i < nr_test_fds; i++) { - if (test_fds[i] < 0) - continue; - close(test_fds[i]); - test_fds[i] = -1; - } - nr_test_fds = 0; -} - -#define barrier() __asm__ __volatile__("": : :"memory") -__attribute__((noinline)) int read_ptr(int *ptr) -{ - /* - * Keep GCC from optimizing this away somehow - */ - barrier(); - return *ptr; -} - -void test_pkey_alloc_free_attach_pkey0(int *ptr, u16 pkey) -{ - int i, err; - int max_nr_pkey_allocs; - int alloced_pkeys[NR_PKEYS]; - int nr_alloced = 0; - long size; - - pkey_assert(pkey_last_malloc_record); - size = pkey_last_malloc_record->size; - /* - * This is a bit of a hack. But mprotect() requires - * huge-page-aligned sizes when operating on hugetlbfs. - * So, make sure that we use something that's a multiple - * of a huge page when we can. - */ - if (size >= HPAGE_SIZE) - size = HPAGE_SIZE; - - /* allocate every possible key and make sure key-0 never got allocated */ - max_nr_pkey_allocs = NR_PKEYS; - for (i = 0; i < max_nr_pkey_allocs; i++) { - int new_pkey = alloc_pkey(); - pkey_assert(new_pkey != 0); - - if (new_pkey < 0) - break; - alloced_pkeys[nr_alloced++] = new_pkey; - } - /* free all the allocated keys */ - for (i = 0; i < nr_alloced; i++) { - int free_ret; - - if (!alloced_pkeys[i]) - continue; - free_ret = sys_pkey_free(alloced_pkeys[i]); - pkey_assert(!free_ret); - } - - /* attach key-0 in various modes */ - err = sys_mprotect_pkey(ptr, size, PROT_READ, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_WRITE, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_EXEC, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE, 0); - pkey_assert(!err); - err = sys_mprotect_pkey(ptr, size, PROT_READ|PROT_WRITE|PROT_EXEC, 0); - pkey_assert(!err); -} - -void test_read_of_write_disabled_region(int *ptr, u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling write access to PKEY[1], doing read\n"); - pkey_write_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - dprintf1("\n"); -} -void test_read_of_access_disabled_region(int *ptr, u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", pkey, ptr); - read_pkey_reg(); - pkey_access_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - expected_pkey_fault(pkey); -} - -void test_read_of_access_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - int ptr_contents; - - dprintf1("disabling access to PKEY[%02d], doing read @ %p\n", - pkey, ptr); - ptr_contents = read_ptr(ptr); - dprintf1("reading ptr before 
disabling the read : %d\n", - ptr_contents); - read_pkey_reg(); - pkey_access_deny(pkey); - ptr_contents = read_ptr(ptr); - dprintf1("*ptr: %d\n", ptr_contents); - expected_pkey_fault(pkey); -} - -void test_write_of_write_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - *ptr = __LINE__; - dprintf1("disabling write access; after accessing the page, " - "to PKEY[%02d], doing write\n", pkey); - pkey_write_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_write_of_write_disabled_region(int *ptr, u16 pkey) -{ - dprintf1("disabling write access to PKEY[%02d], doing write\n", pkey); - pkey_write_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} -void test_write_of_access_disabled_region(int *ptr, u16 pkey) -{ - dprintf1("disabling access to PKEY[%02d], doing write\n", pkey); - pkey_access_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_write_of_access_disabled_region_with_page_already_mapped(int *ptr, - u16 pkey) -{ - *ptr = __LINE__; - dprintf1("disabling access; after accessing the page, " - " to PKEY[%02d], doing write\n", pkey); - pkey_access_deny(pkey); - *ptr = __LINE__; - expected_pkey_fault(pkey); -} - -void test_kernel_write_of_access_disabled_region(int *ptr, u16 pkey) -{ - int ret; - int test_fd = get_test_read_fd(); - - dprintf1("disabling access to PKEY[%02d], " - "having kernel read() to buffer\n", pkey); - pkey_access_deny(pkey); - ret = read(test_fd, ptr, 1); - dprintf1("read ret: %d\n", ret); - pkey_assert(ret); -} -void test_kernel_write_of_write_disabled_region(int *ptr, u16 pkey) -{ - int ret; - int test_fd = get_test_read_fd(); - - pkey_write_deny(pkey); - ret = read(test_fd, ptr, 100); - dprintf1("read ret: %d\n", ret); - if (ret < 0 && (DEBUG_LEVEL > 0)) - perror("verbose read result (OK for this to be bad)"); - pkey_assert(ret); -} - -void test_kernel_gup_of_access_disabled_region(int *ptr, u16 pkey) -{ - int pipe_ret, vmsplice_ret; - struct iovec iov; - int pipe_fds[2]; - - pipe_ret = pipe(pipe_fds); - - pkey_assert(pipe_ret == 0); - dprintf1("disabling access to PKEY[%02d], " - "having kernel vmsplice from buffer\n", pkey); - pkey_access_deny(pkey); - iov.iov_base = ptr; - iov.iov_len = PAGE_SIZE; - vmsplice_ret = vmsplice(pipe_fds[1], &iov, 1, SPLICE_F_GIFT); - dprintf1("vmsplice() ret: %d\n", vmsplice_ret); - pkey_assert(vmsplice_ret == -1); - - close(pipe_fds[0]); - close(pipe_fds[1]); -} - -void test_kernel_gup_write_to_write_disabled_region(int *ptr, u16 pkey) -{ - int ignored = 0xdada; - int futex_ret; - int some_int = __LINE__; - - dprintf1("disabling write to PKEY[%02d], " - "doing futex gunk in buffer\n", pkey); - *ptr = some_int; - pkey_write_deny(pkey); - futex_ret = syscall(SYS_futex, ptr, FUTEX_WAIT, some_int-1, NULL, - &ignored, ignored); - if (DEBUG_LEVEL > 0) - perror("futex"); - dprintf1("futex() ret: %d\n", futex_ret); -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_on_non_allocated_pkey(int *ptr, u16 pkey) -{ - int err; - int i; - - /* Note: 0 is the default pkey, so don't mess with it */ - for (i = 1; i < NR_PKEYS; i++) { - if (pkey == i) - continue; - - dprintf1("trying get/set/free to non-allocated pkey: %2d\n", i); - err = sys_pkey_free(i); - pkey_assert(err); - - err = sys_pkey_free(i); - pkey_assert(err); - - err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, i); - pkey_assert(err); - } -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_syscalls_bad_args(int *ptr, u16 pkey) -{ - int 
err; - int bad_pkey = NR_PKEYS+99; - - /* pass a known-invalid pkey in: */ - err = sys_mprotect_pkey(ptr, PAGE_SIZE, PROT_READ, bad_pkey); - pkey_assert(err); -} - -void become_child(void) -{ - pid_t forkret; - - forkret = fork(); - pkey_assert(forkret >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), forkret); - - if (!forkret) { - /* in the child */ - return; - } - exit(0); -} - -/* Assumes that all pkeys other than 'pkey' are unallocated */ -void test_pkey_alloc_exhaust(int *ptr, u16 pkey) -{ - int err; - int allocated_pkeys[NR_PKEYS] = {0}; - int nr_allocated_pkeys = 0; - int i; - - for (i = 0; i < NR_PKEYS*3; i++) { - int new_pkey; - dprintf1("%s() alloc loop: %d\n", __func__, i); - new_pkey = alloc_pkey(); - dprintf4("%s()::%d, err: %d pkey_reg: 0x%016llx" - " shadow: 0x%016llx\n", - __func__, __LINE__, err, __read_pkey_reg(), - shadow_pkey_reg); - read_pkey_reg(); /* for shadow checking */ - dprintf2("%s() errno: %d ENOSPC: %d\n", __func__, errno, ENOSPC); - if ((new_pkey == -1) && (errno == ENOSPC)) { - dprintf2("%s() failed to allocate pkey after %d tries\n", - __func__, nr_allocated_pkeys); - } else { - /* - * Ensure the number of successes never - * exceeds the number of keys supported - * in the hardware. - */ - pkey_assert(nr_allocated_pkeys < NR_PKEYS); - allocated_pkeys[nr_allocated_pkeys++] = new_pkey; - } - - /* - * Make sure that allocation state is properly - * preserved across fork(). - */ - if (i == NR_PKEYS*2) - become_child(); - } - - dprintf3("%s()::%d\n", __func__, __LINE__); - - /* - * On x86: - * There are 16 pkeys supported in hardware. Three are - * allocated by the time we get here: - * 1. The default key (0) - * 2. One possibly consumed by an execute-only mapping. - * 3. One allocated by the test code and passed in via - * 'pkey' to this function. - * Ensure that we can allocate at least another 13 (16-3). - * - * On powerpc: - * There are either 5, 28, 29 or 32 pkeys supported in - * hardware depending on the page size (4K or 64K) and - * platform (powernv or powervm). Four are allocated by - * the time we get here. These include pkey-0, pkey-1, - * exec-only pkey and the one allocated by the test code. - * Ensure that we can allocate the remaining. - */ - pkey_assert(i >= (NR_PKEYS - get_arch_reserved_keys() - 1)); - - for (i = 0; i < nr_allocated_pkeys; i++) { - err = sys_pkey_free(allocated_pkeys[i]); - pkey_assert(!err); - read_pkey_reg(); /* for shadow checking */ - } -} - -void arch_force_pkey_reg_init(void) -{ -#if defined(__i386__) || defined(__x86_64__) /* arch */ - u64 *buf; - - /* - * All keys should be allocated and set to allow reads and - * writes, so the register should be all 0. If not, just - * skip the test. - */ - if (read_pkey_reg()) - return; - - /* - * Just allocate an absurd about of memory rather than - * doing the XSAVE size enumeration dance. - */ - buf = mmap(NULL, 1*MB, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - - /* These __builtins require compiling with -mxsave */ - - /* XSAVE to build a valid buffer: */ - __builtin_ia32_xsave(buf, XSTATE_PKEY); - /* Clear XSTATE_BV[PKRU]: */ - buf[XSTATE_BV_OFFSET/sizeof(u64)] &= ~XSTATE_PKEY; - /* XRSTOR will likely get PKRU back to the init state: */ - __builtin_ia32_xrstor(buf, XSTATE_PKEY); - - munmap(buf, 1*MB); -#endif -} - - -/* - * This is mostly useless on ppc for now. But it will not - * hurt anything and should give some better coverage as - * a long-running test that continually checks the pkey - * register. 
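 *
 * On x86 the force-to-init trick above boils down to two steps: clear
 * the PKRU feature bit in XSTATE_BV inside a valid XSAVE buffer, then
 * XRSTOR it so the register falls back to its init state:
 *
 *	buf[XSTATE_BV_OFFSET / sizeof(u64)] &= ~XSTATE_PKEY;
 *	__builtin_ia32_xrstor(buf, XSTATE_PKEY);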
- */ -void test_pkey_init_state(int *ptr, u16 pkey) -{ - int err; - int allocated_pkeys[NR_PKEYS] = {0}; - int nr_allocated_pkeys = 0; - int i; - - for (i = 0; i < NR_PKEYS; i++) { - int new_pkey = alloc_pkey(); - - if (new_pkey < 0) - continue; - allocated_pkeys[nr_allocated_pkeys++] = new_pkey; - } - - dprintf3("%s()::%d\n", __func__, __LINE__); - - arch_force_pkey_reg_init(); - - /* - * Loop for a bit, hoping to get exercise the kernel - * context switch code. - */ - for (i = 0; i < 1000000; i++) - read_pkey_reg(); - - for (i = 0; i < nr_allocated_pkeys; i++) { - err = sys_pkey_free(allocated_pkeys[i]); - pkey_assert(!err); - read_pkey_reg(); /* for shadow checking */ - } -} - -/* - * pkey 0 is special. It is allocated by default, so you do not - * have to call pkey_alloc() to use it first. Make sure that it - * is usable. - */ -void test_mprotect_with_pkey_0(int *ptr, u16 pkey) -{ - long size; - int prot; - - assert(pkey_last_malloc_record); - size = pkey_last_malloc_record->size; - /* - * This is a bit of a hack. But mprotect() requires - * huge-page-aligned sizes when operating on hugetlbfs. - * So, make sure that we use something that's a multiple - * of a huge page when we can. - */ - if (size >= HPAGE_SIZE) - size = HPAGE_SIZE; - prot = pkey_last_malloc_record->prot; - - /* Use pkey 0 */ - mprotect_pkey(ptr, size, prot, 0); - - /* Make sure that we can set it back to the original pkey. */ - mprotect_pkey(ptr, size, prot, pkey); -} - -void test_ptrace_of_child(int *ptr, u16 pkey) -{ - __attribute__((__unused__)) int peek_result; - pid_t child_pid; - void *ignored = 0; - long ret; - int status; - /* - * This is the "control" for our little expermient. Make sure - * we can always access it when ptracing. - */ - int *plain_ptr_unaligned = malloc(HPAGE_SIZE); - int *plain_ptr = ALIGN_PTR_UP(plain_ptr_unaligned, PAGE_SIZE); - - /* - * Fork a child which is an exact copy of this process, of course. - * That means we can do all of our tests via ptrace() and then plain - * memory access and ensure they work differently. 
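 *
 * In short, for a pkey-protected pointer the test expects this
 * asymmetry (a sketch of the calls made below):
 *
 *	ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored);
 *	pkey_assert(ret != -1);
 *	peek_result = read_ptr(ptr);
 *	expected_pkey_fault(pkey);
 *
 * ptrace() reaches the child's memory through the kernel and is not
 * gated by the tracer's pkey register, while the plain read_ptr() is.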
- */ - child_pid = fork_lazy_child(); - dprintf1("[%d] child pid: %d\n", getpid(), child_pid); - - ret = ptrace(PTRACE_ATTACH, child_pid, ignored, ignored); - if (ret) - perror("attach"); - dprintf1("[%d] attach ret: %ld %d\n", getpid(), ret, __LINE__); - pkey_assert(ret != -1); - ret = waitpid(child_pid, &status, WUNTRACED); - if ((ret != child_pid) || !(WIFSTOPPED(status))) { - fprintf(stderr, "weird waitpid result %ld stat %x\n", - ret, status); - pkey_assert(0); - } - dprintf2("waitpid ret: %ld\n", ret); - dprintf2("waitpid status: %d\n", status); - - pkey_access_deny(pkey); - pkey_write_deny(pkey); - - /* Write access, untested for now: - ret = ptrace(PTRACE_POKEDATA, child_pid, peek_at, data); - pkey_assert(ret != -1); - dprintf1("poke at %p: %ld\n", peek_at, ret); - */ - - /* - * Try to access the pkey-protected "ptr" via ptrace: - */ - ret = ptrace(PTRACE_PEEKDATA, child_pid, ptr, ignored); - /* expect it to work, without an error: */ - pkey_assert(ret != -1); - /* Now access from the current task, and expect an exception: */ - peek_result = read_ptr(ptr); - expected_pkey_fault(pkey); - - /* - * Try to access the NON-pkey-protected "plain_ptr" via ptrace: - */ - ret = ptrace(PTRACE_PEEKDATA, child_pid, plain_ptr, ignored); - /* expect it to work, without an error: */ - pkey_assert(ret != -1); - /* Now access from the current task, and expect NO exception: */ - peek_result = read_ptr(plain_ptr); - do_not_expect_pkey_fault("read plain pointer after ptrace"); - - ret = ptrace(PTRACE_DETACH, child_pid, ignored, 0); - pkey_assert(ret != -1); - - ret = kill(child_pid, SIGKILL); - pkey_assert(ret != -1); - - wait(&status); - - free(plain_ptr_unaligned); -} - -void *get_pointer_to_instructions(void) -{ - void *p1; - - p1 = ALIGN_PTR_UP(&lots_o_noops_around_write, PAGE_SIZE); - dprintf3("&lots_o_noops: %p\n", &lots_o_noops_around_write); - /* lots_o_noops_around_write should be page-aligned already */ - assert(p1 == &lots_o_noops_around_write); - - /* Point 'p1' at the *second* page of the function: */ - p1 += PAGE_SIZE; - - /* - * Try to ensure we fault this in on next touch to ensure - * we get an instruction fault as opposed to a data one - */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - - return p1; -} - -void test_executing_on_unreadable_memory(int *ptr, u16 pkey) -{ - void *p1; - int scratch; - int ptr_contents; - int ret; - - p1 = get_pointer_to_instructions(); - lots_o_noops_around_write(&scratch); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - - ret = mprotect_pkey(p1, PAGE_SIZE, PROT_EXEC, (u64)pkey); - pkey_assert(!ret); - pkey_access_deny(pkey); - - dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); - - /* - * Make sure this is an *instruction* fault - */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - lots_o_noops_around_write(&scratch); - do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - expect_fault_on_read_execonly_key(p1, pkey); -} - -void test_implicit_mprotect_exec_only_memory(int *ptr, u16 pkey) -{ - void *p1; - int scratch; - int ptr_contents; - int ret; - - dprintf1("%s() start\n", __func__); - - p1 = get_pointer_to_instructions(); - lots_o_noops_around_write(&scratch); - ptr_contents = read_ptr(p1); - dprintf2("ptr (%p) contents@%d: %x\n", p1, __LINE__, ptr_contents); - - /* Use a *normal* mprotect(), not mprotect_pkey(): */ - ret = mprotect(p1, PAGE_SIZE, PROT_EXEC); - pkey_assert(!ret); - - /* - * Reset the shadow, assuming that the above mprotect() - * correctly changed PKRU, but to an unknown value since - 
* the actual allocated pkey is unknown. - */ - shadow_pkey_reg = __read_pkey_reg(); - - dprintf2("pkey_reg: %016llx\n", read_pkey_reg()); - - /* Make sure this is an *instruction* fault */ - madvise(p1, PAGE_SIZE, MADV_DONTNEED); - lots_o_noops_around_write(&scratch); - do_not_expect_pkey_fault("executing on PROT_EXEC memory"); - expect_fault_on_read_execonly_key(p1, UNKNOWN_PKEY); - - /* - * Put the memory back to non-PROT_EXEC. Should clear the - * exec-only pkey off the VMA and allow it to be readable - * again. Go to PROT_NONE first to check for a kernel bug - * that did not clear the pkey when doing PROT_NONE. - */ - ret = mprotect(p1, PAGE_SIZE, PROT_NONE); - pkey_assert(!ret); - - ret = mprotect(p1, PAGE_SIZE, PROT_READ|PROT_EXEC); - pkey_assert(!ret); - ptr_contents = read_ptr(p1); - do_not_expect_pkey_fault("plain read on recently PROT_EXEC area"); -} - -#if defined(__i386__) || defined(__x86_64__) -void test_ptrace_modifies_pkru(int *ptr, u16 pkey) -{ - u32 new_pkru; - pid_t child; - int status, ret; - int pkey_offset = pkey_reg_xstate_offset(); - size_t xsave_size = cpu_max_xsave_size(); - void *xsave; - u32 *pkey_register; - u64 *xstate_bv; - struct iovec iov; - - new_pkru = ~read_pkey_reg(); - /* Don't make PROT_EXEC mappings inaccessible */ - new_pkru &= ~3; - - child = fork(); - pkey_assert(child >= 0); - dprintf3("[%d] fork() ret: %d\n", getpid(), child); - if (!child) { - ptrace(PTRACE_TRACEME, 0, 0, 0); - /* Stop and allow the tracer to modify PKRU directly */ - raise(SIGSTOP); - - /* - * need __read_pkey_reg() version so we do not do shadow_pkey_reg - * checking - */ - if (__read_pkey_reg() != new_pkru) - exit(1); - - /* Stop and allow the tracer to clear XSTATE_BV for PKRU */ - raise(SIGSTOP); - - if (__read_pkey_reg() != 0) - exit(1); - - /* Stop and allow the tracer to examine PKRU */ - raise(SIGSTOP); - - exit(0); - } - - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - xsave = (void *)malloc(xsave_size); - pkey_assert(xsave > 0); - - /* Modify the PKRU register directly */ - iov.iov_base = xsave; - iov.iov_len = xsave_size; - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - pkey_register = (u32 *)(xsave + pkey_offset); - pkey_assert(*pkey_register == read_pkey_reg()); - - *pkey_register = new_pkru; - - ret = ptrace(PTRACE_SETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - /* Test that the modification is visible in ptrace before any execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == new_pkru); - - /* Execute the tracee */ - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - - /* Test that the tracee saw the PKRU value change */ - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - /* Test that the modification is visible in ptrace after execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == new_pkru); - - /* Clear the PKRU bit from XSTATE_BV */ - xstate_bv = (u64 *)(xsave + 512); - *xstate_bv &= ~(1 << 9); - - ret = ptrace(PTRACE_SETREGSET, child, (void 
*)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - - /* Test that the modification is visible in ptrace before any execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == 0); - - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - - /* Test that the tracee saw the PKRU value go to 0 */ - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGSTOP); - - /* Test that the modification is visible in ptrace after execution */ - memset(xsave, 0xCC, xsave_size); - ret = ptrace(PTRACE_GETREGSET, child, (void *)NT_X86_XSTATE, &iov); - pkey_assert(ret == 0); - pkey_assert(*pkey_register == 0); - - ret = ptrace(PTRACE_CONT, child, 0, 0); - pkey_assert(ret == 0); - pkey_assert(child == waitpid(child, &status, 0)); - dprintf3("[%d] waitpid(%d) status: %x\n", getpid(), child, status); - pkey_assert(WIFEXITED(status)); - pkey_assert(WEXITSTATUS(status) == 0); - free(xsave); -} -#endif - -void test_mprotect_pkey_on_unsupported_cpu(int *ptr, u16 pkey) -{ - int size = PAGE_SIZE; - int sret; - - if (cpu_has_pkeys()) { - dprintf1("SKIP: %s: no CPU support\n", __func__); - return; - } - - sret = syscall(SYS_mprotect_key, ptr, size, PROT_READ, pkey); - pkey_assert(sret < 0); -} - -void (*pkey_tests[])(int *ptr, u16 pkey) = { - test_read_of_write_disabled_region, - test_read_of_access_disabled_region, - test_read_of_access_disabled_region_with_page_already_mapped, - test_write_of_write_disabled_region, - test_write_of_write_disabled_region_with_page_already_mapped, - test_write_of_access_disabled_region, - test_write_of_access_disabled_region_with_page_already_mapped, - test_kernel_write_of_access_disabled_region, - test_kernel_write_of_write_disabled_region, - test_kernel_gup_of_access_disabled_region, - test_kernel_gup_write_to_write_disabled_region, - test_executing_on_unreadable_memory, - test_implicit_mprotect_exec_only_memory, - test_mprotect_with_pkey_0, - test_ptrace_of_child, - test_pkey_init_state, - test_pkey_syscalls_on_non_allocated_pkey, - test_pkey_syscalls_bad_args, - test_pkey_alloc_exhaust, - test_pkey_alloc_free_attach_pkey0, -#if defined(__i386__) || defined(__x86_64__) - test_ptrace_modifies_pkru, -#endif -}; - -void run_tests_once(void) -{ - int *ptr; - int prot = PROT_READ|PROT_WRITE; - - for (test_nr = 0; test_nr < ARRAY_SIZE(pkey_tests); test_nr++) { - int pkey; - int orig_pkey_faults = pkey_faults; - - dprintf1("======================\n"); - dprintf1("test %d preparing...\n", test_nr); - - tracing_on(); - pkey = alloc_random_pkey(); - dprintf1("test %d starting with pkey: %d\n", test_nr, pkey); - ptr = malloc_pkey(PAGE_SIZE, prot, pkey); - dprintf1("test %d starting...\n", test_nr); - pkey_tests[test_nr](ptr, pkey); - dprintf1("freeing test memory: %p\n", ptr); - free_pkey_malloc(ptr); - sys_pkey_free(pkey); - - dprintf1("pkey_faults: %d\n", pkey_faults); - dprintf1("orig_pkey_faults: %d\n", orig_pkey_faults); - - tracing_off(); - close_test_fds(); - - printf("test %2d PASSED (iteration %d)\n", test_nr, iteration_nr); - dprintf1("======================\n\n"); - } - iteration_nr++; -} - -void pkey_setup_shadow(void) -{ - shadow_pkey_reg = __read_pkey_reg(); -} - -int main(void) -{ - int nr_iterations = 22; - int pkeys_supported = is_pkeys_supported(); - - srand((unsigned int)time(NULL)); - - setup_handlers(); - - printf("has pkeys: 
%d\n", pkeys_supported); - - if (!pkeys_supported) { - int size = PAGE_SIZE; - int *ptr; - - printf("running PKEY tests for unsupported CPU/OS\n"); - - ptr = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); - assert(ptr != (void *)-1); - test_mprotect_pkey_on_unsupported_cpu(ptr, 1); - exit(0); - } - - pkey_setup_shadow(); - printf("startup pkey_reg: %016llx\n", read_pkey_reg()); - setup_hugetlbfs(); - - while (nr_iterations-- > 0) - run_tests_once(); - - printf("done (all tests OK)\n"); - return 0; -} --- a/tools/testing/selftests/vm/run_vmtests.sh +++ /dev/null @@ -1,274 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# Please run as root - -# Kselftest framework requirement - SKIP code is 4. -ksft_skip=4 - -exitcode=0 - -usage() { - cat <<EOF -usage: ${BASH_SOURCE[0]:-$0} [ -h | -t "<categories>"] - -t: specify specific categories to tests to run - -h: display this message - -The default behavior is to run all tests. - -Alternatively, specific groups tests can be run by passing a string -to the -t argument containing one or more of the following categories -separated by spaces: -- mmap - tests for mmap(2) -- gup_test - tests for gup using gup_test interface -- userfaultfd - tests for userfaultfd(2) -- compaction - a test for the patch "Allow compaction of unevictable pages" -- mlock - tests for mlock(2) -- mremap - tests for mremap(2) -- hugevm - tests for very large virtual address space -- vmalloc - vmalloc smoke tests -- hmm - hmm smoke tests -- madv_populate - test memadvise(2) MADV_POPULATE_{READ,WRITE} options -- memfd_secret - test memfd_secret(2) -- process_mrelease - test process_mrelease(2) -- ksm - ksm tests that do not require >=2 NUMA nodes -- ksm_numa - ksm tests that require >=2 NUMA nodes -- pkey - memory protection key tests -- soft_dirty - test soft dirty page bit semantics -- cow - test copy-on-write semantics -example: ./run_vmtests.sh -t "hmm mmap ksm" -EOF - exit 0 -} - - -while getopts "ht:" OPT; do - case ${OPT} in - "h") usage ;; - "t") VM_SELFTEST_ITEMS=${OPTARG} ;; - esac -done -shift $((OPTIND -1)) - -# default behavior: run all tests -VM_SELFTEST_ITEMS=${VM_SELFTEST_ITEMS:-default} - -test_selected() { - if [ "$VM_SELFTEST_ITEMS" == "default" ]; then - # If no VM_SELFTEST_ITEMS are specified, run all tests - return 0 - fi - # If test selected argument is one of the test items - if [[ " ${VM_SELFTEST_ITEMS[*]} " =~ " ${1} " ]]; then - return 0 - else - return 1 - fi -} - -# get huge pagesize and freepages from /proc/meminfo -while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs="$size" - fi - if [ "$name" = "Hugepagesize:" ]; then - hpgsize_KB="$size" - fi -done < /proc/meminfo - -# Simple hugetlbfs tests have a hardcoded minimum requirement of -# huge pages totaling 256MB (262144KB) in size. The userfaultfd -# hugetlb test requires a minimum of 2 * nr_cpus huge pages. Take -# both of these requirements into account and attempt to increase -# number of huge pages available. -nr_cpus=$(nproc) -hpgsize_MB=$((hpgsize_KB / 1024)) -half_ufd_size_MB=$((((nr_cpus * hpgsize_MB + 127) / 128) * 128)) -needmem_KB=$((half_ufd_size_MB * 2 * 1024)) - -# set proper nr_hugepages -if [ -n "$freepgs" ] && [ -n "$hpgsize_KB" ]; then - nr_hugepgs=$(cat /proc/sys/vm/nr_hugepages) - needpgs=$((needmem_KB / hpgsize_KB)) - tries=2 - while [ "$tries" -gt 0 ] && [ "$freepgs" -lt "$needpgs" ]; do - lackpgs=$((needpgs - freepgs)) - echo 3 > /proc/sys/vm/drop_caches - if ! 
echo $((lackpgs + nr_hugepgs)) > /proc/sys/vm/nr_hugepages; then - echo "Please run this test as root" - exit $ksft_skip - fi - while read -r name size unit; do - if [ "$name" = "HugePages_Free:" ]; then - freepgs=$size - fi - done < /proc/meminfo - tries=$((tries - 1)) - done - if [ "$freepgs" -lt "$needpgs" ]; then - printf "Not enough huge pages available (%d < %d)\n" \ - "$freepgs" "$needpgs" - exit 1 - fi -else - echo "no hugetlbfs support in kernel?" - exit 1 -fi - -# filter 64bit architectures -ARCH64STR="arm64 ia64 mips64 parisc64 ppc64 ppc64le riscv64 s390x sh64 sparc64 x86_64" -if [ -z "$ARCH" ]; then - ARCH=$(uname -m 2>/dev/null | sed -e 's/aarch64.*/arm64/') -fi -VADDR64=0 -echo "$ARCH64STR" | grep "$ARCH" &>/dev/null && VADDR64=1 - -# Usage: run_test [test binary] [arbitrary test arguments...] -run_test() { - if test_selected ${CATEGORY}; then - local title="running $*" - local sep=$(echo -n "$title" | tr "[:graph:][:space:]" -) - printf "%s\n%s\n%s\n" "$sep" "$title" "$sep" - - "$@" - local ret=$? - if [ $ret -eq 0 ]; then - echo "[PASS]" - elif [ $ret -eq $ksft_skip ]; then - echo "[SKIP]" - exitcode=$ksft_skip - else - echo "[FAIL]" - exitcode=1 - fi - fi # test_selected -} - -CATEGORY="hugetlb" run_test ./hugepage-mmap - -shmmax=$(cat /proc/sys/kernel/shmmax) -shmall=$(cat /proc/sys/kernel/shmall) -echo 268435456 > /proc/sys/kernel/shmmax -echo 4194304 > /proc/sys/kernel/shmall -CATEGORY="hugetlb" run_test ./hugepage-shm -echo "$shmmax" > /proc/sys/kernel/shmmax -echo "$shmall" > /proc/sys/kernel/shmall - -CATEGORY="hugetlb" run_test ./map_hugetlb -CATEGORY="hugetlb" run_test ./hugepage-mremap -CATEGORY="hugetlb" run_test ./hugepage-vmemmap -CATEGORY="hugetlb" run_test ./hugetlb-madvise - -if test_selected "hugetlb"; then - echo "NOTE: These hugetlb tests provide minimal coverage. Use" - echo " https://github.com/libhugetlbfs/libhugetlbfs.git for" - echo " hugetlb regression testing." -fi - -CATEGORY="mmap" run_test ./map_fixed_noreplace - -# get_user_pages_fast() benchmark -CATEGORY="gup_test" run_test ./gup_test -u -# pin_user_pages_fast() benchmark -CATEGORY="gup_test" run_test ./gup_test -a -# Dump pages 0, 19, and 4096, using pin_user_pages: -CATEGORY="gup_test" run_test ./gup_test -ct -F 0x1 0 19 0x1000 - -uffd_mods=("" ":dev") -for mod in "${uffd_mods[@]}"; do - CATEGORY="userfaultfd" run_test ./userfaultfd anon${mod} 20 16 - # Hugetlb tests require source and destination huge pages. Pass in half - # the size ($half_ufd_size_MB), which is used for *each*. 
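#
# Worked example (hypothetical 8-CPU box with 2 MB huge pages):
#	half_ufd_size_MB = ((8 * 2 + 127) / 128) * 128 = 128
# so each hugetlb run below gets 128 MB of huge pages per side, and the
# earlier needmem_KB works out to 128 * 2 * 1024 = 262144 kB (128 pages
# of 2048 kB each).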
- CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 - CATEGORY="userfaultfd" run_test ./userfaultfd shmem${mod} 20 16 -done - -#cleanup -echo "$nr_hugepgs" > /proc/sys/vm/nr_hugepages - -CATEGORY="compaction" run_test ./compaction_test - -CATEGORY="mlock" run_test sudo -u nobody ./on-fault-limit - -CATEGORY="mmap" run_test ./map_populate - -CATEGORY="mlock" run_test ./mlock-random-test - -CATEGORY="mlock" run_test ./mlock2-tests - -CATEGORY="process_mrelease" run_test ./mrelease_test - -CATEGORY="mremap" run_test ./mremap_test - -CATEGORY="hugetlb" run_test ./thuge-gen - -if [ $VADDR64 -ne 0 ]; then - CATEGORY="hugevm" run_test ./virtual_address_range - - # virtual address 128TB switch test - CATEGORY="hugevm" run_test ./va_128TBswitch.sh -fi # VADDR64 - -# vmalloc stability smoke test -CATEGORY="vmalloc" run_test ./test_vmalloc.sh smoke - -CATEGORY="mremap" run_test ./mremap_dontunmap - -CATEGORY="hmm" run_test ./test_hmm.sh smoke - -# MADV_POPULATE_READ and MADV_POPULATE_WRITE tests -CATEGORY="madv_populate" run_test ./madv_populate - -CATEGORY="memfd_secret" run_test ./memfd_secret - -# KSM MADV_MERGEABLE test with 10 identical pages -CATEGORY="ksm" run_test ./ksm_tests -M -p 10 -# KSM unmerge test -CATEGORY="ksm" run_test ./ksm_tests -U -# KSM test with 10 zero pages and use_zero_pages = 0 -CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 0 -# KSM test with 10 zero pages and use_zero_pages = 1 -CATEGORY="ksm" run_test ./ksm_tests -Z -p 10 -z 1 -# KSM test with 2 NUMA nodes and merge_across_nodes = 1 -CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 1 -# KSM test with 2 NUMA nodes and merge_across_nodes = 0 -CATEGORY="ksm_numa" run_test ./ksm_tests -N -m 0 - -CATEGORY="ksm" run_test ./ksm_functional_tests - -run_test ./ksm_functional_tests - -# protection_keys tests -if [ -x ./protection_keys_32 ] -then - CATEGORY="pkey" run_test ./protection_keys_32 -fi - -if [ -x ./protection_keys_64 ] -then - CATEGORY="pkey" run_test ./protection_keys_64 -fi - -CATEGORY="soft_dirty" run_test ./soft-dirty - -# COW tests -CATEGORY="cow" run_test ./cow - -exit $exitcode --- a/tools/testing/selftests/vm/settings +++ /dev/null @@ -1 +0,0 @@ -timeout=45 --- a/tools/testing/selftests/vm/soft-dirty.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <stdio.h> -#include <string.h> -#include <stdbool.h> -#include <fcntl.h> -#include <stdint.h> -#include <malloc.h> -#include <sys/mman.h> -#include "../kselftest.h" -#include "vm_util.h" - -#define PAGEMAP_FILE_PATH "/proc/self/pagemap" -#define TEST_ITERATIONS 10000 - -static void test_simple(int pagemap_fd, int pagesize) -{ - int i; - char *map; - - map = aligned_alloc(pagesize, pagesize); - if (!map) - ksft_exit_fail_msg("mmap failed\n"); - - clear_softdirty(); - - for (i = 0 ; i < TEST_ITERATIONS; i++) { - if (pagemap_is_softdirty(pagemap_fd, map) == 1) { - ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); - break; - } - - clear_softdirty(); - // Write something to the page to get the dirty bit enabled on the page - map[0]++; - - if (pagemap_is_softdirty(pagemap_fd, map) == 0) { - ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); - break; - } - - clear_softdirty(); - } - free(map); - - ksft_test_result(i == TEST_ITERATIONS, "Test %s\n", __func__); -} - -static void test_vma_reuse(int pagemap_fd, int pagesize) -{ - char *map, *map2; - - map = mmap(NULL, pagesize, (PROT_READ | 
PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); - if (map == MAP_FAILED) - ksft_exit_fail_msg("mmap failed"); - - // The kernel always marks new regions as soft dirty - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, - "Test %s dirty bit of allocated page\n", __func__); - - clear_softdirty(); - munmap(map, pagesize); - - map2 = mmap(NULL, pagesize, (PROT_READ | PROT_WRITE), (MAP_PRIVATE | MAP_ANON), -1, 0); - if (map2 == MAP_FAILED) - ksft_exit_fail_msg("mmap failed"); - - // Dirty bit is set for new regions even if they are reused - if (map == map2) - ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, - "Test %s dirty bit of reused address page\n", __func__); - else - ksft_test_result_skip("Test %s dirty bit of reused address page\n", __func__); - - munmap(map2, pagesize); -} - -static void test_hugepage(int pagemap_fd, int pagesize) -{ - char *map; - int i, ret; - size_t hpage_len = read_pmd_pagesize(); - - map = memalign(hpage_len, hpage_len); - if (!map) - ksft_exit_fail_msg("memalign failed\n"); - - ret = madvise(map, hpage_len, MADV_HUGEPAGE); - if (ret) - ksft_exit_fail_msg("madvise failed %d\n", ret); - - for (i = 0; i < hpage_len; i++) - map[i] = (char)i; - - if (check_huge_anon(map, 1, hpage_len)) { - ksft_test_result_pass("Test %s huge page allocation\n", __func__); - - clear_softdirty(); - for (i = 0 ; i < TEST_ITERATIONS ; i++) { - if (pagemap_is_softdirty(pagemap_fd, map) == 1) { - ksft_print_msg("dirty bit was 1, but should be 0 (i=%d)\n", i); - break; - } - - clear_softdirty(); - // Write something to the page to get the dirty bit enabled on the page - map[0]++; - - if (pagemap_is_softdirty(pagemap_fd, map) == 0) { - ksft_print_msg("dirty bit was 0, but should be 1 (i=%d)\n", i); - break; - } - clear_softdirty(); - } - - ksft_test_result(i == TEST_ITERATIONS, "Test %s huge page dirty bit\n", __func__); - } else { - // hugepage allocation failed. 
skip these tests
-		ksft_test_result_skip("Test %s huge page allocation\n", __func__);
-		ksft_test_result_skip("Test %s huge page dirty bit\n", __func__);
-	}
-	free(map);
-}
-
-static void test_mprotect(int pagemap_fd, int pagesize, bool anon)
-{
-	const char *type[] = {"file", "anon"};
-	const char *fname = "./soft-dirty-test-file";
-	int test_fd;
-	char *map;
-
-	if (anon) {
-		map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
-			   MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
-		if (map == MAP_FAILED)
-			ksft_exit_fail_msg("anon mmap failed\n");
-	} else {
-		test_fd = open(fname, O_RDWR | O_CREAT, 0644);
-		if (test_fd < 0) {
-			ksft_test_result_skip("Test %s open() file failed\n", __func__);
-			return;
-		}
-		unlink(fname);
-		ftruncate(test_fd, pagesize);
-		map = mmap(NULL, pagesize, PROT_READ|PROT_WRITE,
-			   MAP_SHARED, test_fd, 0);
-		if (map == MAP_FAILED)
-			ksft_exit_fail_msg("file mmap failed\n");
-	}
-
-	*map = 1;
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
-			 "Test %s-%s dirty bit of new written page\n",
-			 __func__, type[anon]);
-	clear_softdirty();
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-			 "Test %s-%s soft-dirty clear after clear_refs\n",
-			 __func__, type[anon]);
-	mprotect(map, pagesize, PROT_READ);
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-			 "Test %s-%s soft-dirty clear after marking RO\n",
-			 __func__, type[anon]);
-	mprotect(map, pagesize, PROT_READ|PROT_WRITE);
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 0,
-			 "Test %s-%s soft-dirty clear after marking RW\n",
-			 __func__, type[anon]);
-	*map = 2;
-	ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1,
-			 "Test %s-%s soft-dirty after rewritten\n",
-			 __func__, type[anon]);
-
-	munmap(map, pagesize);
-
-	if (!anon)
-		close(test_fd);
-}
-
-static void test_mprotect_anon(int pagemap_fd, int pagesize)
-{
-	test_mprotect(pagemap_fd, pagesize, true);
-}
-
-static void test_mprotect_file(int pagemap_fd, int pagesize)
-{
-	test_mprotect(pagemap_fd, pagesize, false);
-}
-
-int main(int argc, char **argv)
-{
-	int pagemap_fd;
-	int pagesize;
-
-	ksft_print_header();
-	ksft_set_plan(15);
-
-	pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY);
-	if (pagemap_fd < 0)
-		ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH);
-
-	pagesize = getpagesize();
-
-	test_simple(pagemap_fd, pagesize);
-	test_vma_reuse(pagemap_fd, pagesize);
-	test_hugepage(pagemap_fd, pagesize);
-	test_mprotect_anon(pagemap_fd, pagesize);
-	test_mprotect_file(pagemap_fd, pagesize);
-
-	close(pagemap_fd);
-
-	return ksft_exit_pass();
-}
--- a/tools/testing/selftests/vm/split_huge_page_test.c
+++ /dev/null
@@ -1,309 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * A test of splitting PMD THPs and PTE-mapped THPs from a specified virtual
- * address range in a process via <debugfs>/split_huge_pages interface.
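For reference, the debugfs protocol this test drives is a single write of "<pid>,<vaddr_start>,<vaddr_end>" (or "<path>,<pgoff_start>,<pgoff_end>" for the file-backed case), matching PID_FMT and PATH_FMT below. A minimal standalone sketch of such a write (helper name hypothetical):

        #include <stdio.h>
        #include <unistd.h>

        /* split all THPs that back [start, end) in the current process */
        static int split_range(unsigned long start, unsigned long end)
        {
                FILE *f = fopen("/sys/kernel/debug/split_huge_pages", "w");

                if (!f)
                        return -1;
                fprintf(f, "%d,0x%lx,0x%lx", getpid(), start, end);
                return fclose(f);
        }
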
- */ - -#define _GNU_SOURCE -#include <stdio.h> -#include <stdlib.h> -#include <stdarg.h> -#include <unistd.h> -#include <inttypes.h> -#include <string.h> -#include <fcntl.h> -#include <sys/mman.h> -#include <sys/mount.h> -#include <malloc.h> -#include <stdbool.h> -#include "vm_util.h" - -uint64_t pagesize; -unsigned int pageshift; -uint64_t pmd_pagesize; - -#define SPLIT_DEBUGFS "/sys/kernel/debug/split_huge_pages" -#define INPUT_MAX 80 - -#define PID_FMT "%d,0x%lx,0x%lx" -#define PATH_FMT "%s,0x%lx,0x%lx" - -#define PFN_MASK ((1UL<<55)-1) -#define KPF_THP (1UL<<22) - -int is_backed_by_thp(char *vaddr, int pagemap_file, int kpageflags_file) -{ - uint64_t paddr; - uint64_t page_flags; - - if (pagemap_file) { - pread(pagemap_file, &paddr, sizeof(paddr), - ((long)vaddr >> pageshift) * sizeof(paddr)); - - if (kpageflags_file) { - pread(kpageflags_file, &page_flags, sizeof(page_flags), - (paddr & PFN_MASK) * sizeof(page_flags)); - - return !!(page_flags & KPF_THP); - } - } - return 0; -} - -static int write_file(const char *path, const char *buf, size_t buflen) -{ - int fd; - ssize_t numwritten; - - fd = open(path, O_WRONLY); - if (fd == -1) - return 0; - - numwritten = write(fd, buf, buflen - 1); - close(fd); - if (numwritten < 1) - return 0; - - return (unsigned int) numwritten; -} - -static void write_debugfs(const char *fmt, ...) -{ - char input[INPUT_MAX]; - int ret; - va_list argp; - - va_start(argp, fmt); - ret = vsnprintf(input, INPUT_MAX, fmt, argp); - va_end(argp); - - if (ret >= INPUT_MAX) { - printf("%s: Debugfs input is too long\n", __func__); - exit(EXIT_FAILURE); - } - - if (!write_file(SPLIT_DEBUGFS, input, ret + 1)) { - perror(SPLIT_DEBUGFS); - exit(EXIT_FAILURE); - } -} - -void split_pmd_thp(void) -{ - char *one_page; - size_t len = 4 * pmd_pagesize; - size_t i; - - one_page = memalign(pmd_pagesize, len); - - if (!one_page) { - printf("Fail to allocate memory\n"); - exit(EXIT_FAILURE); - } - - madvise(one_page, len, MADV_HUGEPAGE); - - for (i = 0; i < len; i++) - one_page[i] = (char)i; - - if (!check_huge_anon(one_page, 1, pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } - - /* split all THPs */ - write_debugfs(PID_FMT, getpid(), (uint64_t)one_page, - (uint64_t)one_page + len); - - for (i = 0; i < len; i++) - if (one_page[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } - - - if (check_huge_anon(one_page, 0, pmd_pagesize)) { - printf("Still AnonHugePages not split\n"); - exit(EXIT_FAILURE); - } - - printf("Split huge pages successful\n"); - free(one_page); -} - -void split_pte_mapped_thp(void) -{ - char *one_page, *pte_mapped, *pte_mapped2; - size_t len = 4 * pmd_pagesize; - uint64_t thp_size; - size_t i; - const char *pagemap_template = "/proc/%d/pagemap"; - const char *kpageflags_proc = "/proc/kpageflags"; - char pagemap_proc[255]; - int pagemap_fd; - int kpageflags_fd; - - if (snprintf(pagemap_proc, 255, pagemap_template, getpid()) < 0) { - perror("get pagemap proc error"); - exit(EXIT_FAILURE); - } - pagemap_fd = open(pagemap_proc, O_RDONLY); - - if (pagemap_fd == -1) { - perror("read pagemap:"); - exit(EXIT_FAILURE); - } - - kpageflags_fd = open(kpageflags_proc, O_RDONLY); - - if (kpageflags_fd == -1) { - perror("read kpageflags:"); - exit(EXIT_FAILURE); - } - - one_page = mmap((void *)(1UL << 30), len, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - - madvise(one_page, len, MADV_HUGEPAGE); - - for (i = 0; i < len; i++) - one_page[i] = (char)i; - - if (!check_huge_anon(one_page, 1, 
pmd_pagesize)) { - printf("No THP is allocated\n"); - exit(EXIT_FAILURE); - } - - /* remap the first pagesize of first THP */ - pte_mapped = mremap(one_page, pagesize, pagesize, MREMAP_MAYMOVE); - - /* remap the Nth pagesize of Nth THP */ - for (i = 1; i < 4; i++) { - pte_mapped2 = mremap(one_page + pmd_pagesize * i + pagesize * i, - pagesize, pagesize, - MREMAP_MAYMOVE|MREMAP_FIXED, - pte_mapped + pagesize * i); - if (pte_mapped2 == (char *)-1) { - perror("mremap failed"); - exit(EXIT_FAILURE); - } - } - - /* smap does not show THPs after mremap, use kpageflags instead */ - thp_size = 0; - for (i = 0; i < pagesize * 4; i++) - if (i % pagesize == 0 && - is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) - thp_size++; - - if (thp_size != 4) { - printf("Some THPs are missing during mremap\n"); - exit(EXIT_FAILURE); - } - - /* split all remapped THPs */ - write_debugfs(PID_FMT, getpid(), (uint64_t)pte_mapped, - (uint64_t)pte_mapped + pagesize * 4); - - /* smap does not show THPs after mremap, use kpageflags instead */ - thp_size = 0; - for (i = 0; i < pagesize * 4; i++) { - if (pte_mapped[i] != (char)i) { - printf("%ld byte corrupted\n", i); - exit(EXIT_FAILURE); - } - if (i % pagesize == 0 && - is_backed_by_thp(&pte_mapped[i], pagemap_fd, kpageflags_fd)) - thp_size++; - } - - if (thp_size) { - printf("Still %ld THPs not split\n", thp_size); - exit(EXIT_FAILURE); - } - - printf("Split PTE-mapped huge pages successful\n"); - munmap(one_page, len); - close(pagemap_fd); - close(kpageflags_fd); -} - -void split_file_backed_thp(void) -{ - int status; - int fd; - ssize_t num_written; - char tmpfs_template[] = "/tmp/thp_split_XXXXXX"; - const char *tmpfs_loc = mkdtemp(tmpfs_template); - char testfile[INPUT_MAX]; - uint64_t pgoff_start = 0, pgoff_end = 1024; - - printf("Please enable pr_debug in split_huge_pages_in_file() if you need more info.\n"); - - status = mount("tmpfs", tmpfs_loc, "tmpfs", 0, "huge=always,size=4m"); - - if (status) { - printf("Unable to create a tmpfs for testing\n"); - exit(EXIT_FAILURE); - } - - status = snprintf(testfile, INPUT_MAX, "%s/thp_file", tmpfs_loc); - if (status >= INPUT_MAX) { - printf("Fail to create file-backed THP split testing file\n"); - goto cleanup; - } - - fd = open(testfile, O_CREAT|O_WRONLY); - if (fd == -1) { - perror("Cannot open testing file\n"); - goto cleanup; - } - - /* write something to the file, so a file-backed THP can be allocated */ - num_written = write(fd, tmpfs_loc, strlen(tmpfs_loc) + 1); - close(fd); - - if (num_written < 1) { - printf("Fail to write data to testing file\n"); - goto cleanup; - } - - /* split the file-backed THP */ - write_debugfs(PATH_FMT, testfile, pgoff_start, pgoff_end); - - status = unlink(testfile); - if (status) - perror("Cannot remove testing file\n"); - -cleanup: - status = umount(tmpfs_loc); - if (status) { - printf("Unable to umount %s\n", tmpfs_loc); - exit(EXIT_FAILURE); - } - status = rmdir(tmpfs_loc); - if (status) { - perror("cannot remove tmp dir"); - exit(EXIT_FAILURE); - } - - printf("file-backed THP split test done, please check dmesg for more information\n"); -} - -int main(int argc, char **argv) -{ - if (geteuid() != 0) { - printf("Please run the benchmark as root\n"); - exit(EXIT_FAILURE); - } - - pagesize = getpagesize(); - pageshift = ffs(pagesize) - 1; - pmd_pagesize = read_pmd_pagesize(); - - split_pmd_thp(); - split_pte_mapped_thp(); - split_file_backed_thp(); - - return 0; -} --- a/tools/testing/selftests/vm/test_hmm.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash -# 
SPDX-License-Identifier: GPL-2.0
-#
-# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@xxxxxxxxx>
-#
-# This is a test script for the kernel HMM test driver (CONFIG_TEST_HMM).
-# Therefore it is just a kernel module loader. It loads test_hmm.ko, runs
-# the hmm-tests binary against it, and can optionally take the
-# spm_addr_dev0 and spm_addr_dev1 module parameters in order to exercise
-# device memory backed by reserved physical address ranges (SPM).
-
-TEST_NAME="test_hmm"
-DRIVER="test_hmm"
-
-# 1 if fails
-exitcode=1
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-check_test_requirements()
-{
-	uid=$(id -u)
-	if [ $uid -ne 0 ]; then
-		echo "$0: Must be run as root"
-		exit $ksft_skip
-	fi
-
-	if ! which modprobe > /dev/null 2>&1; then
-		echo "$0: You need modprobe installed"
-		exit $ksft_skip
-	fi
-
-	if ! modinfo $DRIVER > /dev/null 2>&1; then
-		echo "$0: You must have the following enabled in your kernel:"
-		echo "CONFIG_TEST_HMM=m"
-		exit $ksft_skip
-	fi
-}
-
-load_driver()
-{
-	if [ $# -eq 0 ]; then
-		modprobe $DRIVER > /dev/null 2>&1
-	else
-		if [ $# -eq 2 ]; then
-			modprobe $DRIVER spm_addr_dev0=$1 spm_addr_dev1=$2 > /dev/null 2>&1
-		else
-			echo "Missing module parameters. Make sure to pass"\
-			"spm_addr_dev0 and spm_addr_dev1"
-			usage
-		fi
-	fi
-}
-
-unload_driver()
-{
-	modprobe -r $DRIVER > /dev/null 2>&1
-}
-
-run_smoke()
-{
-	echo "Running smoke test. Note, this test provides basic coverage."
-
-	load_driver $1 $2
-	$(dirname "${BASH_SOURCE[0]}")/hmm-tests
-	unload_driver
-}
-
-usage()
-{
-	echo -n "Usage: $0"
-	echo
-	echo "Example usage:"
-	echo
-	echo "# Shows help message"
-	echo "./${TEST_NAME}.sh"
-	echo
-	echo "# Smoke testing"
-	echo "./${TEST_NAME}.sh smoke"
-	echo
-	echo "# Smoke testing with SPM enabled"
-	echo "./${TEST_NAME}.sh smoke <spm_addr_dev0> <spm_addr_dev1>"
-	echo
-	exit 0
-}
-
-function run_test()
-{
-	if [ $# -eq 0 ]; then
-		usage
-	else
-		if [ "$1" = "smoke" ]; then
-			run_smoke $2 $3
-		else
-			usage
-		fi
-	fi
-}
-
-check_test_requirements
-run_test $@
-
-exit 0
--- a/tools/testing/selftests/vm/test_vmalloc.sh
+++ /dev/null
@@ -1,177 +0,0 @@
-#!/bin/bash
-# SPDX-License-Identifier: GPL-2.0
-#
-# Copyright (C) 2018 Uladzislau Rezki (Sony) <urezki@xxxxxxxxx>
-#
-# This is a test script for the kernel test driver to analyse vmalloc
-# allocator. Therefore it is just a kernel module loader. You can specify
-# and pass different parameters in order to:
-#     a) analyse performance of vmalloc allocations;
-#     b) stress and stability-check the vmalloc subsystem.
-
-TEST_NAME="vmalloc"
-DRIVER="test_${TEST_NAME}"
-NUM_CPUS=`grep -c ^processor /proc/cpuinfo`
-
-# 1 if fails
-exitcode=1
-
-# Kselftest framework requirement - SKIP code is 4.
-ksft_skip=4
-
-#
-# Static templates for performance, stressing and smoke tests.
-# Also it is possible to pass any supported parameters manually.
-#
-PERF_PARAM="sequential_test_order=1 test_repeat_count=3"
-SMOKE_PARAM="test_loop_count=10000 test_repeat_count=10"
-STRESS_PARAM="nr_threads=$NUM_CPUS test_repeat_count=20"
-
-check_test_requirements()
-{
-	uid=$(id -u)
-	if [ $uid -ne 0 ]; then
-		echo "$0: Must be run as root"
-		exit $ksft_skip
-	fi
-
-	if ! which modprobe > /dev/null 2>&1; then
-		echo "$0: You need modprobe installed"
-		exit $ksft_skip
-	fi
-
-	if ! modinfo $DRIVER > /dev/null 2>&1; then
-		echo "$0: You must have the following enabled in your kernel:"
-		echo "CONFIG_TEST_VMALLOC=m"
-		exit $ksft_skip
-	fi
-}
-
-run_performance_check()
-{
-	echo "Run performance tests to evaluate how fast vmalloc allocation is."
-	echo "It runs all test cases on one single CPU with sequential order."
-
-	modprobe $DRIVER $PERF_PARAM > /dev/null 2>&1
-	echo "Done."
-	echo "Check the kernel message buffer to see the summary."
-}
-
-run_stability_check()
-{
-	echo "Run stability tests. In order to stress vmalloc subsystem all"
-	echo "available test cases are run by NUM_CPUS workers simultaneously."
-	echo "It will take time, so be patient."
-
-	modprobe $DRIVER $STRESS_PARAM > /dev/null 2>&1
-	echo "Done."
-	echo "Check the kernel ring buffer to see the summary."
-}
-
-run_smoke_check()
-{
-	echo "Run smoke test. Note, this test provides basic coverage."
-	echo "Please check $0 output to see how it can be used"
-	echo "for deep performance analysis as well as stress testing."
-
-	modprobe $DRIVER $SMOKE_PARAM > /dev/null 2>&1
-	echo "Done."
-	echo "Check the kernel ring buffer to see the summary."
-}
-
-usage()
-{
-	echo -n "Usage: $0 [ performance ] | [ stress ] | [ smoke ] | "
-	echo "manual parameters"
-	echo
-	echo "Valid tests and parameters:"
-	echo
-	modinfo $DRIVER
-	echo
-	echo "Example usage:"
-	echo
-	echo "# Shows help message"
-	echo "./${DRIVER}.sh"
-	echo
-	echo "# Runs 1 test(id_1), repeats it 5 times by NUM_CPUS workers"
-	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS run_test_mask=1 test_repeat_count=5"
-	echo
-	echo -n "# Runs 4 tests(id_1|id_2|id_4|id_16) on one CPU with "
-	echo "sequential order"
-	echo -n "./${DRIVER}.sh sequential_test_order=1 "
-	echo "run_test_mask=23"
-	echo
-	echo -n "# Runs all tests by NUM_CPUS workers, shuffled order, repeats "
-	echo "20 times"
-	echo "./${DRIVER}.sh nr_threads=$NUM_CPUS test_repeat_count=20"
-	echo
-	echo "# Performance analysis"
-	echo "./${DRIVER}.sh performance"
-	echo
-	echo "# Stress testing"
-	echo "./${DRIVER}.sh stress"
-	echo
-	exit 0
-}
-
-function validate_passed_args()
-{
-	VALID_ARGS=`modinfo $DRIVER | awk '/parm:/ {print $2}' | sed 's/:.*//'`
-
-	#
-	# Something has been passed, check it.
-	#
-	for passed_arg in $@; do
-		key=${passed_arg//=*/}
-		val="${passed_arg:$((${#key}+1))}"
-		valid=0
-
-		for valid_arg in $VALID_ARGS; do
-			if [[ $key = $valid_arg ]] && [[ $val -gt 0 ]]; then
-				valid=1
-				break
-			fi
-		done
-
-		if [[ $valid -ne 1 ]]; then
-			echo "Error: key or value is not correct: ${key} $val"
-			exit $exitcode
-		fi
-	done
-}
-
-function run_manual_check()
-{
-	#
-	# Validate passed parameters. If there is a wrong one,
-	# the script exits and does not execute further.
-	#
	validate_passed_args $@
-
-	echo "Run the test with the following parameters: $@"
-	modprobe $DRIVER $@ > /dev/null 2>&1
-	echo "Done."
-	echo "Check the kernel ring buffer to see the summary."
-}
-
-function run_test()
-{
-	if [ $# -eq 0 ]; then
-		usage
-	else
-		if [[ "$1" = "performance" ]]; then
-			run_performance_check
-		elif [[ "$1" = "stress" ]]; then
-			run_stability_check
-		elif [[ "$1" = "smoke" ]]; then
-			run_smoke_check
-		else
-			run_manual_check $@
-		fi
-	fi
-}
-
-check_test_requirements
-run_test $@
-
-exit 0
--- a/tools/testing/selftests/vm/thuge-gen.c
+++ /dev/null
@@ -1,257 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Test selecting other page sizes for mmap/shmget.
-
-   Before running this, huge pages for each huge page size must have been
-   reserved.
-   For large pages beyond MAX_ORDER (like 1GB on x86) boot options must be used.
-   Also shmmax must be increased.
-   And you need to run as root to work around some weird permissions in shm.
-   And nothing using huge pages should run in parallel.
- When the program aborts you may need to clean up the shm segments with - ipcrm -m by hand, like this - sudo ipcs | awk '$1 == "0x00000000" {print $2}' | xargs -n1 sudo ipcrm -m - (warning this will remove all if someone else uses them) */ - -#define _GNU_SOURCE 1 -#include <sys/mman.h> -#include <stdlib.h> -#include <stdio.h> -#include <sys/ipc.h> -#include <sys/shm.h> -#include <sys/stat.h> -#include <glob.h> -#include <assert.h> -#include <unistd.h> -#include <stdarg.h> -#include <string.h> - -#define err(x) perror(x), exit(1) - -#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) -#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) -#define MAP_HUGE_SHIFT 26 -#define MAP_HUGE_MASK 0x3f -#if !defined(MAP_HUGETLB) -#define MAP_HUGETLB 0x40000 -#endif - -#define SHM_HUGETLB 04000 /* segment will use huge TLB pages */ -#define SHM_HUGE_SHIFT 26 -#define SHM_HUGE_MASK 0x3f -#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT) -#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) - -#define NUM_PAGESIZES 5 - -#define NUM_PAGES 4 - -#define Dprintf(fmt...) // printf(fmt) - -unsigned long page_sizes[NUM_PAGESIZES]; -int num_page_sizes; - -int ilog2(unsigned long v) -{ - int l = 0; - while ((1UL << l) < v) - l++; - return l; -} - -void find_pagesizes(void) -{ - glob_t g; - int i; - glob("/sys/kernel/mm/hugepages/hugepages-*kB", 0, NULL, &g); - assert(g.gl_pathc <= NUM_PAGESIZES); - for (i = 0; i < g.gl_pathc; i++) { - sscanf(g.gl_pathv[i], "/sys/kernel/mm/hugepages/hugepages-%lukB", - &page_sizes[i]); - page_sizes[i] <<= 10; - printf("Found %luMB\n", page_sizes[i] >> 20); - } - num_page_sizes = g.gl_pathc; - globfree(&g); -} - -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - free(line); - return hps; -} - -void show(unsigned long ps) -{ - char buf[100]; - if (ps == getpagesize()) - return; - printf("%luMB: ", ps >> 20); - fflush(stdout); - snprintf(buf, sizeof buf, - "cat /sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages", - ps >> 10); - system(buf); -} - -unsigned long read_sysfs(int warn, char *fmt, ...) 
-{
-	char *line = NULL;
-	size_t linelen = 0;
-	char buf[100];
-	FILE *f;
-	va_list ap;
-	unsigned long val = 0;
-
-	va_start(ap, fmt);
-	vsnprintf(buf, sizeof buf, fmt, ap);
-	va_end(ap);
-
-	f = fopen(buf, "r");
-	if (!f) {
-		if (warn)
-			printf("missing %s\n", buf);
-		return 0;
-	}
-	if (getline(&line, &linelen, f) > 0) {
-		sscanf(line, "%lu", &val);
-	}
-	fclose(f);
-	free(line);
-	return val;
-}
-
-unsigned long read_free(unsigned long ps)
-{
-	return read_sysfs(ps != getpagesize(),
-			  "/sys/kernel/mm/hugepages/hugepages-%lukB/free_hugepages",
-			  ps >> 10);
-}
-
-void test_mmap(unsigned long size, unsigned flags)
-{
-	char *map;
-	unsigned long before, after;
-	int err;
-
-	before = read_free(size);
-	map = mmap(NULL, size*NUM_PAGES, PROT_READ|PROT_WRITE,
-		   MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB|flags, -1, 0);
-
-	if (map == (char *)-1) err("mmap");
-	memset(map, 0xff, size*NUM_PAGES);
-	after = read_free(size);
-	Dprintf("before %lu after %lu diff %ld size %lu\n",
-		before, after, before - after, size);
-	assert(size == getpagesize() || (before - after) == NUM_PAGES);
-	show(size);
-	err = munmap(map, size*NUM_PAGES);
-	assert(!err);
-}
-
-void test_shmget(unsigned long size, unsigned flags)
-{
-	int id;
-	unsigned long before, after;
-	int err;
-
-	before = read_free(size);
-	id = shmget(IPC_PRIVATE, size * NUM_PAGES, IPC_CREAT|0600|flags);
-	if (id < 0) err("shmget");
-
-	struct shm_info i;
-	if (shmctl(id, SHM_INFO, (void *)&i) < 0) err("shmctl");
-	Dprintf("alloc %lu res %lu\n", i.shm_tot, i.shm_rss);
-
-
-	Dprintf("id %d\n", id);
-	char *map = shmat(id, NULL, 0600);
-	if (map == (char *)-1) err("shmat");
-
-	shmctl(id, IPC_RMID, NULL);
-
-	memset(map, 0xff, size*NUM_PAGES);
-	after = read_free(size);
-
-	Dprintf("before %lu after %lu diff %ld size %lu\n",
-		before, after, before - after, size);
-	assert(size == getpagesize() || (before - after) == NUM_PAGES);
-	show(size);
-	err = shmdt(map);
-	assert(!err);
-}
-
-void sanity_checks(void)
-{
-	int i;
-	unsigned long largest = getpagesize();
-
-	for (i = 0; i < num_page_sizes; i++) {
-		if (page_sizes[i] > largest)
-			largest = page_sizes[i];
-
-		if (read_free(page_sizes[i]) < NUM_PAGES) {
-			printf("Not enough huge pages for page size %lu MB, need %u\n",
-			       page_sizes[i] >> 20,
-			       NUM_PAGES);
-			exit(0);
-		}
-	}
-
-	if (read_sysfs(0, "/proc/sys/kernel/shmmax") < NUM_PAGES * largest) {
-		printf("Please do echo %lu > /proc/sys/kernel/shmmax\n", largest * NUM_PAGES);
-		exit(0);
-	}
-
-#if defined(__x86_64__)
-	if (largest != 1U<<30) {
-		printf("No GB pages available on x86-64\n"
-		       "Please boot with hugepagesz=1G hugepages=%d\n", NUM_PAGES);
-		exit(0);
-	}
-#endif
-}
-
-int main(void)
-{
-	int i;
-	unsigned long default_hps = default_huge_page_size();
-
-	find_pagesizes();
-
-	sanity_checks();
-
-	for (i = 0; i < num_page_sizes; i++) {
-		unsigned long ps = page_sizes[i];
-		int arg = ilog2(ps) << MAP_HUGE_SHIFT;
-		printf("Testing %luMB mmap with shift %x\n", ps >> 20, arg);
-		test_mmap(ps, MAP_HUGETLB | arg);
-	}
-	printf("Testing default huge mmap\n");
-	test_mmap(default_hps, MAP_HUGETLB);
-
-	puts("Testing non-huge shmget");
-	test_shmget(getpagesize(), 0);
-
-	for (i = 0; i < num_page_sizes; i++) {
-		unsigned long ps = page_sizes[i];
-		int arg = ilog2(ps) << SHM_HUGE_SHIFT;
-		printf("Testing %luMB shmget with shift %x\n", ps >> 20, arg);
-		test_shmget(ps, SHM_HUGETLB | arg);
-	}
-	puts("default huge shmget");
-	test_shmget(default_hps, SHM_HUGETLB);
-
-	return 0;
-}
--- a/tools/testing/selftests/vm/transhuge-stress.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- *
Stress test for transparent huge pages, memory compaction and migration. - * - * Authors: Konstantin Khlebnikov <koct9i@xxxxxxxxx> - * - * This is free and unencumbered software released into the public domain. - */ - -#include <stdlib.h> -#include <stdio.h> -#include <stdint.h> -#include <err.h> -#include <time.h> -#include <unistd.h> -#include <fcntl.h> -#include <string.h> -#include <sys/mman.h> -#include "util.h" - -int backing_fd = -1; -int mmap_flags = MAP_ANONYMOUS | MAP_NORESERVE | MAP_PRIVATE; -#define PROT_RW (PROT_READ | PROT_WRITE) - -int main(int argc, char **argv) -{ - size_t ram, len; - void *ptr, *p; - struct timespec a, b; - int i = 0; - char *name = NULL; - double s; - uint8_t *map; - size_t map_len; - int pagemap_fd; - - ram = sysconf(_SC_PHYS_PAGES); - if (ram > SIZE_MAX / sysconf(_SC_PAGESIZE) / 4) - ram = SIZE_MAX / 4; - else - ram *= sysconf(_SC_PAGESIZE); - len = ram; - - while (++i < argc) { - if (!strcmp(argv[i], "-h")) - errx(1, "usage: %s [size in MiB]", argv[0]); - else if (!strcmp(argv[i], "-f")) - name = argv[++i]; - else - len = atoll(argv[i]) << 20; - } - - if (name) { - backing_fd = open(name, O_RDWR); - if (backing_fd == -1) - errx(2, "open %s", name); - mmap_flags = MAP_SHARED; - } - - warnx("allocate %zd transhuge pages, using %zd MiB virtual memory" - " and %zd MiB of ram", len >> HPAGE_SHIFT, len >> 20, - ram >> (20 + HPAGE_SHIFT - PAGE_SHIFT - 1)); - - pagemap_fd = open("/proc/self/pagemap", O_RDONLY); - if (pagemap_fd < 0) - err(2, "open pagemap"); - - len -= len % HPAGE_SIZE; - ptr = mmap(NULL, len + HPAGE_SIZE, PROT_RW, mmap_flags, backing_fd, 0); - if (ptr == MAP_FAILED) - err(2, "initial mmap"); - ptr += HPAGE_SIZE - (uintptr_t)ptr % HPAGE_SIZE; - - if (madvise(ptr, len, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - map_len = ram >> (HPAGE_SHIFT - 1); - map = malloc(map_len); - if (!map) - errx(2, "map malloc"); - - while (1) { - int nr_succeed = 0, nr_failed = 0, nr_pages = 0; - - memset(map, 0, map_len); - - clock_gettime(CLOCK_MONOTONIC, &a); - for (p = ptr; p < ptr + len; p += HPAGE_SIZE) { - int64_t pfn; - - pfn = allocate_transhuge(p, pagemap_fd); - - if (pfn < 0) { - nr_failed++; - } else { - size_t idx = pfn >> (HPAGE_SHIFT - PAGE_SHIFT); - - nr_succeed++; - if (idx >= map_len) { - map = realloc(map, idx + 1); - if (!map) - errx(2, "map realloc"); - memset(map + map_len, 0, idx + 1 - map_len); - map_len = idx + 1; - } - if (!map[idx]) - nr_pages++; - map[idx] = 1; - } - - /* split transhuge page, keep last page */ - if (madvise(p, HPAGE_SIZE - PAGE_SIZE, MADV_DONTNEED)) - err(2, "MADV_DONTNEED"); - } - clock_gettime(CLOCK_MONOTONIC, &b); - s = b.tv_sec - a.tv_sec + (b.tv_nsec - a.tv_nsec) / 1000000000.; - - warnx("%.3f s/loop, %.3f ms/page, %10.3f MiB/s\t" - "%4d succeed, %4d failed, %4d different pages", - s, s * 1000 / (len >> HPAGE_SHIFT), len / s / (1 << 20), - nr_succeed, nr_failed, nr_pages); - } -} --- a/tools/testing/selftests/vm/userfaultfd.c +++ /dev/null @@ -1,1858 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Stress userfaultfd syscall. - * - * Copyright (C) 2015 Red Hat, Inc. - * - * This test allocates two virtual areas and bounces the physical - * memory across the two virtual areas (from area_src to area_dst) - * using userfaultfd. 
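A condensed sketch of the userfaultfd handshake that this test repeats for every run (error handling omitted; the real sequence lives in userfaultfd_open() and the UFFDIO_REGISTER calls below):

        #include <fcntl.h>
        #include <sys/ioctl.h>
        #include <sys/syscall.h>
        #include <unistd.h>
        #include <linux/userfaultfd.h>

        static int uffd_register_range(void *area, unsigned long len)
        {
                struct uffdio_api api = { .api = UFFD_API };
                struct uffdio_register reg = {
                        .range = { .start = (unsigned long)area, .len = len },
                        .mode = UFFDIO_REGISTER_MODE_MISSING,
                };
                int ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

                ioctl(ufd, UFFDIO_API, &api);           /* negotiate features */
                ioctl(ufd, UFFDIO_REGISTER, &reg);      /* watch the range */
                return ufd;     /* read() now yields struct uffd_msg events */
        }
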
- * - * There are three threads running per CPU: - * - * 1) one per-CPU thread takes a per-page pthread_mutex in a random - * page of the area_dst (while the physical page may still be in - * area_src), and increments a per-page counter in the same page, - * and checks its value against a verification region. - * - * 2) another per-CPU thread handles the userfaults generated by - * thread 1 above. userfaultfd blocking reads or poll() modes are - * exercised interleaved. - * - * 3) one last per-CPU thread transfers the memory in the background - * at maximum bandwidth (if not already transferred by thread - * 2). Each cpu thread takes cares of transferring a portion of the - * area. - * - * When all threads of type 3 completed the transfer, one bounce is - * complete. area_src and area_dst are then swapped. All threads are - * respawned and so the bounce is immediately restarted in the - * opposite direction. - * - * per-CPU threads 1 by triggering userfaults inside - * pthread_mutex_lock will also verify the atomicity of the memory - * transfer (UFFDIO_COPY). - */ - -#define _GNU_SOURCE -#include <stdio.h> -#include <errno.h> -#include <unistd.h> -#include <stdlib.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <time.h> -#include <signal.h> -#include <poll.h> -#include <string.h> -#include <linux/mman.h> -#include <sys/mman.h> -#include <sys/syscall.h> -#include <sys/ioctl.h> -#include <sys/wait.h> -#include <pthread.h> -#include <linux/userfaultfd.h> -#include <setjmp.h> -#include <stdbool.h> -#include <assert.h> -#include <inttypes.h> -#include <stdint.h> -#include <sys/random.h> - -#include "../kselftest.h" -#include "vm_util.h" - -#ifdef __NR_userfaultfd - -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; - -#define BOUNCE_RANDOM (1<<0) -#define BOUNCE_RACINGFAULTS (1<<1) -#define BOUNCE_VERIFY (1<<2) -#define BOUNCE_POLL (1<<3) -static int bounces; - -#define TEST_ANON 1 -#define TEST_HUGETLB 2 -#define TEST_SHMEM 3 -static int test_type; - -#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) - -#define BASE_PMD_ADDR ((void *)(1UL << 30)) - -/* test using /dev/userfaultfd, instead of userfaultfd(2) */ -static bool test_dev_userfaultfd; - -/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ -#define ALARM_INTERVAL_SECS 10 -static volatile bool test_uffdio_copy_eexist = true; -static volatile bool test_uffdio_zeropage_eexist = true; -/* Whether to test uffd write-protection */ -static bool test_uffdio_wp = true; -/* Whether to test uffd minor faults */ -static bool test_uffdio_minor = false; -static bool map_shared; -static int mem_fd; -static unsigned long long *count_verify; -static int uffd = -1; -static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; -static char *zeropage; -pthread_attr_t attr; -static bool test_collapse; - -/* Userfaultfd test statistics */ -struct uffd_stats { - int cpu; - unsigned long missing_faults; - unsigned long wp_faults; - unsigned long minor_faults; -}; - -/* pthread_mutex_t starts at page offset 0 */ -#define area_mutex(___area, ___nr) \ - ((pthread_mutex_t *) ((___area) + (___nr)*page_size)) -/* - * count is placed in the page after pthread_mutex_t naturally aligned - * to avoid non alignment faults on non-x86 archs. 
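The macro below rounds the counter's address up to an 8-byte boundary with the usual add-and-mask idiom. Standalone, the same computation is:

        #include <stdint.h>

        /* align must be a power of two */
        static inline uintptr_t align_up(uintptr_t p, uintptr_t align)
        {
                return (p + align - 1) & ~(align - 1);
        }

so align_up(0x1005, 8) == 0x1008; area_count() applies this with align = sizeof(unsigned long long).
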
- */ -#define area_count(___area, ___nr) \ - ((volatile unsigned long long *) ((unsigned long) \ - ((___area) + (___nr)*page_size + \ - sizeof(pthread_mutex_t) + \ - sizeof(unsigned long long) - 1) & \ - ~(unsigned long)(sizeof(unsigned long long) \ - - 1))) - -#define swap(a, b) \ - do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) - -#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) - -const char *examples = - "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" - "./userfaultfd anon 100 99999\n\n" - "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" - "./userfaultfd anon:dev 100 99999\n\n" - "# Run share memory test on 1GiB region with 99 bounces:\n" - "./userfaultfd shmem 1000 99\n\n" - "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" - "./userfaultfd hugetlb 256 50\n\n" - "# Run the same hugetlb test but using shared file:\n" - "./userfaultfd hugetlb_shared 256 50\n\n" - "# 10MiB-~6GiB 999 bounces anonymous test, " - "continue forever unless an error triggers\n" - "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n"; - -static void usage(void) -{ - fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> " - "[hugetlbfs_file]\n\n"); - fprintf(stderr, "Supported <test type>: anon, hugetlb, " - "hugetlb_shared, shmem\n\n"); - fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " - "Supported mods:\n"); - fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); - fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); - fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" - "memory\n"); - fprintf(stderr, "\nExample test mod usage:\n"); - fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); - fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); - - fprintf(stderr, "Examples:\n\n"); - fprintf(stderr, "%s", examples); - exit(1); -} - -#define _err(fmt, ...) \ - do { \ - int ret = errno; \ - fprintf(stderr, "ERROR: " fmt, ##__VA_ARGS__); \ - fprintf(stderr, " (errno=%d, line=%d)\n", \ - ret, __LINE__); \ - } while (0) - -#define errexit(exitcode, fmt, ...) \ - do { \ - _err(fmt, ##__VA_ARGS__); \ - exit(exitcode); \ - } while (0) - -#define err(fmt, ...) 
errexit(1, fmt, ##__VA_ARGS__) - -static void uffd_stats_reset(struct uffd_stats *uffd_stats, - unsigned long n_cpus) -{ - int i; - - for (i = 0; i < n_cpus; i++) { - uffd_stats[i].cpu = i; - uffd_stats[i].missing_faults = 0; - uffd_stats[i].wp_faults = 0; - uffd_stats[i].minor_faults = 0; - } -} - -static void uffd_stats_report(struct uffd_stats *stats, int n_cpus) -{ - int i; - unsigned long long miss_total = 0, wp_total = 0, minor_total = 0; - - for (i = 0; i < n_cpus; i++) { - miss_total += stats[i].missing_faults; - wp_total += stats[i].wp_faults; - minor_total += stats[i].minor_faults; - } - - printf("userfaults: "); - if (miss_total) { - printf("%llu missing (", miss_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].missing_faults); - printf("\b) "); - } - if (wp_total) { - printf("%llu wp (", wp_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].wp_faults); - printf("\b) "); - } - if (minor_total) { - printf("%llu minor (", minor_total); - for (i = 0; i < n_cpus; i++) - printf("%lu+", stats[i].minor_faults); - printf("\b)"); - } - printf("\n"); -} - -static void anon_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); -} - -static void anon_allocate_area(void **alloc_area, bool is_src) -{ - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); -} - -static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ -} - -static void hugetlb_release_pages(char *rel_area) -{ - if (!map_shared) { - if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) - err("madvise(MADV_DONTNEED) failed"); - } else { - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); - } -} - -static void hugetlb_allocate_area(void **alloc_area, bool is_src) -{ - off_t size = nr_pages * page_size; - off_t offset = is_src ? 0 : size; - void *area_alias = NULL; - char **alloc_area_alias; - - *alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE, - (map_shared ? MAP_SHARED : MAP_PRIVATE) | - (is_src ? 0 : MAP_NORESERVE), - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of hugetlbfs file failed"); - - if (map_shared) { - area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE, - MAP_SHARED, mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of hugetlb file alias failed"); - } - - if (is_src) { - alloc_area_alias = &area_src_alias; - } else { - alloc_area_alias = &area_dst_alias; - } - if (area_alias) - *alloc_area_alias = area_alias; -} - -static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - if (!map_shared) - return; - - *start = (unsigned long) area_dst_alias + offset; -} - -static void shmem_release_pages(char *rel_area) -{ - if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) - err("madvise(MADV_REMOVE) failed"); -} - -static void shmem_allocate_area(void **alloc_area, bool is_src) -{ - void *area_alias = NULL; - size_t bytes = nr_pages * page_size; - unsigned long offset = is_src ? 
0 : bytes; - char *p = NULL, *p_alias = NULL; - - if (test_collapse) { - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); - p_alias = p; - p_alias += bytes; - p_alias += hpage_size; /* Prevent src/dst VMA merge */ - } - - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (*alloc_area == MAP_FAILED) - err("mmap of memfd failed"); - if (test_collapse && *alloc_area != p) - err("mmap of memfd failed at %p", p); - - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, - mem_fd, offset); - if (area_alias == MAP_FAILED) - err("mmap of memfd alias failed"); - if (test_collapse && area_alias != p_alias) - err("mmap of anonymous memory failed at %p", p_alias); - - if (is_src) - area_src_alias = area_alias; - else - area_dst_alias = area_alias; -} - -static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) -{ - *start = (unsigned long)area_dst_alias + offset; -} - -static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) -{ - if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) - err("Did not find expected %d number of hugepages", - expect_nr_hpages); -} - -struct uffd_test_ops { - void (*allocate_area)(void **alloc_area, bool is_src); - void (*release_pages)(char *rel_area); - void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); - void (*check_pmd_mapping)(void *p, int expect_nr_hpages); -}; - -static struct uffd_test_ops anon_uffd_test_ops = { - .allocate_area = anon_allocate_area, - .release_pages = anon_release_pages, - .alias_mapping = noop_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops shmem_uffd_test_ops = { - .allocate_area = shmem_allocate_area, - .release_pages = shmem_release_pages, - .alias_mapping = shmem_alias_mapping, - .check_pmd_mapping = shmem_check_pmd_mapping, -}; - -static struct uffd_test_ops hugetlb_uffd_test_ops = { - .allocate_area = hugetlb_allocate_area, - .release_pages = hugetlb_release_pages, - .alias_mapping = hugetlb_alias_mapping, - .check_pmd_mapping = NULL, -}; - -static struct uffd_test_ops *uffd_test_ops; - -static inline uint64_t uffd_minor_feature(void) -{ - if (test_type == TEST_HUGETLB && map_shared) - return UFFD_FEATURE_MINOR_HUGETLBFS; - else if (test_type == TEST_SHMEM) - return UFFD_FEATURE_MINOR_SHMEM; - else - return 0; -} - -static uint64_t get_expected_ioctls(uint64_t mode) -{ - uint64_t ioctls = UFFD_API_RANGE_IOCTLS; - - if (test_type == TEST_HUGETLB) - ioctls &= ~(1 << _UFFDIO_ZEROPAGE); - - if (!((mode & UFFDIO_REGISTER_MODE_WP) && test_uffdio_wp)) - ioctls &= ~(1 << _UFFDIO_WRITEPROTECT); - - if (!((mode & UFFDIO_REGISTER_MODE_MINOR) && test_uffdio_minor)) - ioctls &= ~(1 << _UFFDIO_CONTINUE); - - return ioctls; -} - -static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) -{ - uint64_t expected = get_expected_ioctls(mode); - uint64_t actual = ioctls & expected; - - if (actual != expected) { - err("missing ioctl(s): expected %"PRIx64" actual: %"PRIx64, - expected, actual); - } -} - -static int __userfaultfd_open_dev(void) -{ - int fd, _uffd; - - fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); - if (fd < 0) - errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); - - _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); - if (_uffd < 0) - errexit(errno == ENOTTY ? 
KSFT_SKIP : 1,
-			"creating userfaultfd failed");
-	close(fd);
-	return _uffd;
-}
-
-static void userfaultfd_open(uint64_t *features)
-{
-	struct uffdio_api uffdio_api;
-
-	if (test_dev_userfaultfd)
-		uffd = __userfaultfd_open_dev();
-	else {
-		uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
-		if (uffd < 0)
-			errexit(errno == ENOSYS ? KSFT_SKIP : 1,
-				"creating userfaultfd failed");
-	}
-	uffd_flags = fcntl(uffd, F_GETFD, NULL);
-
-	uffdio_api.api = UFFD_API;
-	uffdio_api.features = *features;
-	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
-		err("UFFDIO_API failed.\nPlease make sure to "
-		    "run with either root or ptrace capability.");
-	if (uffdio_api.api != UFFD_API)
-		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
-
-	*features = uffdio_api.features;
-}
-
-static inline void munmap_area(void **area)
-{
-	if (*area)
-		if (munmap(*area, nr_pages * page_size))
-			err("munmap");
-
-	*area = NULL;
-}
-
-static void uffd_test_ctx_clear(void)
-{
-	size_t i;
-
-	if (pipefd) {
-		for (i = 0; i < nr_cpus * 2; ++i) {
-			if (close(pipefd[i]))
-				err("close pipefd");
-		}
-		free(pipefd);
-		pipefd = NULL;
-	}
-
-	if (count_verify) {
-		free(count_verify);
-		count_verify = NULL;
-	}
-
-	if (uffd != -1) {
-		if (close(uffd))
-			err("close uffd");
-		uffd = -1;
-	}
-
-	munmap_area((void **)&area_src);
-	munmap_area((void **)&area_src_alias);
-	munmap_area((void **)&area_dst);
-	munmap_area((void **)&area_dst_alias);
-	munmap_area((void **)&area_remap);
-}
-
-static void uffd_test_ctx_init(uint64_t features)
-{
-	unsigned long nr, cpu;
-
-	uffd_test_ctx_clear();
-
-	uffd_test_ops->allocate_area((void **)&area_src, true);
-	uffd_test_ops->allocate_area((void **)&area_dst, false);
-
-	userfaultfd_open(&features);
-
-	count_verify = malloc(nr_pages * sizeof(unsigned long long));
-	if (!count_verify)
-		err("count_verify");
-
-	for (nr = 0; nr < nr_pages; nr++) {
-		*area_mutex(area_src, nr) =
-			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
-		count_verify[nr] = *area_count(area_src, nr) = 1;
-		/*
-		 * In the transition from 255 to 256, powerpc will
-		 * read out of order in my_bcmp and see both bytes as
-		 * zero, so leave a placeholder below always non-zero
-		 * after the count, to avoid my_bcmp triggering false
-		 * positives.
-		 */
-		*(area_count(area_src, nr) + 1) = 1;
-	}
-
-	/*
-	 * After initialization of area_src, we must explicitly release pages
-	 * for area_dst to make sure it's fully empty. Otherwise we could have
-	 * some area_dst pages be erroneously initialized with zero pages,
-	 * hence we could hit memory corruption later in the test.
-	 *
-	 * One example is when THP is globally enabled, above allocate_area()
-	 * calls could have the two areas merged into a single VMA (as they
-	 * will have the same VMA flags so they're mergeable). When we
-	 * initialize the area_src above, it's possible that some part of
-	 * area_dst could have been faulted in via one huge THP that will be
-	 * shared between area_src and area_dst. It could cause some of
-	 * area_dst to no longer be trapped by missing userfaults.
-	 *
-	 * This release_pages() will guarantee even if that happened, we'll
-	 * proactively split the thp and drop any accidentally initialized
-	 * pages within area_dst.
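What release_pages() relies on, in isolation: zapping the pages guarantees that the next touch of area_dst is a missing fault that userfaultfd can trap. A minimal sketch mirroring the *_release_pages() helpers above (name hypothetical):

        #include <sys/mman.h>

        static int drop_pages(void *area, size_t len, int shared)
        {
                /* shared (shmem/hugetlb) needs MADV_REMOVE; private
                 * anonymous memory only needs MADV_DONTNEED */
                return madvise(area, len, shared ? MADV_REMOVE : MADV_DONTNEED);
        }
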
- */ - uffd_test_ops->release_pages(area_dst); - - pipefd = malloc(sizeof(int) * nr_cpus * 2); - if (!pipefd) - err("pipefd"); - for (cpu = 0; cpu < nr_cpus; cpu++) - if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK)) - err("pipe"); -} - -static int my_bcmp(char *str1, char *str2, size_t n) -{ - unsigned long i; - for (i = 0; i < n; i++) - if (str1[i] != str2[i]) - return 1; - return 0; -} - -static void wp_range(int ufd, __u64 start, __u64 len, bool wp) -{ - struct uffdio_writeprotect prms; - - /* Write protection page faults */ - prms.range.start = start; - prms.range.len = len; - /* Undo write-protect, do wakeup after that */ - prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0; - - if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) - err("clear WP failed: address=0x%"PRIx64, (uint64_t)start); -} - -static void continue_range(int ufd, __u64 start, __u64 len) -{ - struct uffdio_continue req; - int ret; - - req.range.start = start; - req.range.len = len; - req.mode = 0; - - if (ioctl(ufd, UFFDIO_CONTINUE, &req)) - err("UFFDIO_CONTINUE failed for address 0x%" PRIx64, - (uint64_t)start); - - /* - * Error handling within the kernel for continue is subtly different - * from copy or zeropage, so it may be a source of bugs. Trigger an - * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG. - */ - req.mapped = 0; - ret = ioctl(ufd, UFFDIO_CONTINUE, &req); - if (ret >= 0 || req.mapped != -EEXIST) - err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64, - ret, (int64_t) req.mapped); -} - -static void *locking_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr; - unsigned long long count; - - if (!(bounces & BOUNCE_RANDOM)) { - page_nr = -bounces; - if (!(bounces & BOUNCE_RACINGFAULTS)) - page_nr += cpu * nr_pages_per_cpu; - } - - while (!finished) { - if (bounces & BOUNCE_RANDOM) { - if (getrandom(&page_nr, sizeof(page_nr), 0) != sizeof(page_nr)) - err("getrandom failed"); - } else - page_nr += 1; - page_nr %= nr_pages; - pthread_mutex_lock(area_mutex(area_dst, page_nr)); - count = *area_count(area_dst, page_nr); - if (count != count_verify[page_nr]) - err("page_nr %lu memory corruption %llu %llu", - page_nr, count, count_verify[page_nr]); - count++; - *area_count(area_dst, page_nr) = count_verify[page_nr] = count; - pthread_mutex_unlock(area_mutex(area_dst, page_nr)); - } - - return NULL; -} - -static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_copy->dst, - uffdio_copy->len, - offset); - if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy->copy != -EEXIST) - err("UFFDIO_COPY retry error: %"PRId64, - (int64_t)uffdio_copy->copy); - } else { - err("UFFDIO_COPY retry unexpected: %"PRId64, - (int64_t)uffdio_copy->copy); - } -} - -static void wake_range(int ufd, unsigned long addr, unsigned long len) -{ - struct uffdio_range uffdio_wake; - - uffdio_wake.start = addr; - uffdio_wake.len = len; - - if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake)) - fprintf(stderr, "error waking %lu\n", - addr), exit(1); -} - -static int __copy_page(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_copy uffdio_copy; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu\n", offset); - uffdio_copy.dst = (unsigned long) area_dst + offset; - uffdio_copy.src = (unsigned long) area_src + offset; - uffdio_copy.len = page_size; - if (test_uffdio_wp) - uffdio_copy.mode = UFFDIO_COPY_MODE_WP; - else - 
uffdio_copy.mode = 0; - uffdio_copy.copy = 0; - if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) { - /* real retval in ufdio_copy.copy */ - if (uffdio_copy.copy != -EEXIST) - err("UFFDIO_COPY error: %"PRId64, - (int64_t)uffdio_copy.copy); - wake_range(ufd, uffdio_copy.dst, page_size); - } else if (uffdio_copy.copy != page_size) { - err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy); - } else { - if (test_uffdio_copy_eexist && retry) { - test_uffdio_copy_eexist = false; - retry_copy_page(ufd, &uffdio_copy, offset); - } - return 1; - } - return 0; -} - -static int copy_page_retry(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, true); -} - -static int copy_page(int ufd, unsigned long offset) -{ - return __copy_page(ufd, offset, false); -} - -static int uffd_read_msg(int ufd, struct uffd_msg *msg) -{ - int ret = read(uffd, msg, sizeof(*msg)); - - if (ret != sizeof(*msg)) { - if (ret < 0) { - if (errno == EAGAIN || errno == EINTR) - return 1; - err("blocking read error"); - } else { - err("short read"); - } - } - - return 0; -} - -static void uffd_handle_page_fault(struct uffd_msg *msg, - struct uffd_stats *stats) -{ - unsigned long offset; - - if (msg->event != UFFD_EVENT_PAGEFAULT) - err("unexpected msg event %u", msg->event); - - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) { - /* Write protect page faults */ - wp_range(uffd, msg->arg.pagefault.address, page_size, false); - stats->wp_faults++; - } else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) { - uint8_t *area; - int b; - - /* - * Minor page faults - * - * To prove we can modify the original range for testing - * purposes, we're going to bit flip this range before - * continuing. - * - * Note that this requires all minor page fault tests operate on - * area_dst (non-UFFD-registered) and area_dst_alias - * (UFFD-registered). - */ - - area = (uint8_t *)(area_dst + - ((char *)msg->arg.pagefault.address - - area_dst_alias)); - for (b = 0; b < page_size; ++b) - area[b] = ~area[b]; - continue_range(uffd, msg->arg.pagefault.address, page_size); - stats->minor_faults++; - } else { - /* - * Missing page faults. - * - * Here we force a write check for each of the missing mode - * faults. It's guaranteed because the only threads that - * will trigger uffd faults are the locking threads, and - * their first instruction to touch the missing page will - * always be pthread_mutex_lock(). - * - * Note that here we relied on an NPTL glibc impl detail to - * always read the lock type at the entry of the lock op - * (pthread_mutex_t.__data.__type, offset 0x10) before - * doing any locking operations to guarantee that. It's - * actually not good to rely on this impl detail because - * logically a pthread-compatible lib can implement the - * locks without types and we can fail when linking with - * them. However since we used to find bugs with this - * strict check we still keep it around. Hopefully this - * could be a good hint when it fails again. If one day - * it'll break on some other impl of glibc we'll revisit. 
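The offset computation right below masks the fault address down to its page start. Standalone:

        /* page_size must be a power of two; 0x1234 -> 0x1000 for 4k pages */
        static inline unsigned long page_align_down(unsigned long addr,
                                                    unsigned long page_size)
        {
                return addr & ~(page_size - 1);
        }
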
- */ - if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) - err("unexpected write fault"); - - offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst; - offset &= ~(page_size-1); - - if (copy_page(uffd, offset)) - stats->missing_faults++; - } -} - -static void *uffd_poll_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - unsigned long cpu = stats->cpu; - struct pollfd pollfd[2]; - struct uffd_msg msg; - struct uffdio_register uffd_reg; - int ret; - char tmp_chr; - - pollfd[0].fd = uffd; - pollfd[0].events = POLLIN; - pollfd[1].fd = pipefd[cpu*2]; - pollfd[1].events = POLLIN; - - for (;;) { - ret = poll(pollfd, 2, -1); - if (ret <= 0) { - if (errno == EINTR || errno == EAGAIN) - continue; - err("poll error: %d", ret); - } - if (pollfd[1].revents & POLLIN) { - if (read(pollfd[1].fd, &tmp_chr, 1) != 1) - err("read pipefd error"); - break; - } - if (!(pollfd[0].revents & POLLIN)) - err("pollfd[0].revents %d", pollfd[0].revents); - if (uffd_read_msg(uffd, &msg)) - continue; - switch (msg.event) { - default: - err("unexpected msg event %u\n", msg.event); - break; - case UFFD_EVENT_PAGEFAULT: - uffd_handle_page_fault(&msg, stats); - break; - case UFFD_EVENT_FORK: - close(uffd); - uffd = msg.arg.fork.ufd; - pollfd[0].fd = uffd; - break; - case UFFD_EVENT_REMOVE: - uffd_reg.range.start = msg.arg.remove.start; - uffd_reg.range.len = msg.arg.remove.end - - msg.arg.remove.start; - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) - err("remove failure"); - break; - case UFFD_EVENT_REMAP: - area_remap = area_dst; /* save for later unmap */ - area_dst = (char *)(unsigned long)msg.arg.remap.to; - break; - } - } - - return NULL; -} - -pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER; - -static void *uffd_read_thread(void *arg) -{ - struct uffd_stats *stats = (struct uffd_stats *)arg; - struct uffd_msg msg; - - pthread_mutex_unlock(&uffd_read_mutex); - /* from here cancellation is ok */ - - for (;;) { - if (uffd_read_msg(uffd, &msg)) - continue; - uffd_handle_page_fault(&msg, stats); - } - - return NULL; -} - -static void *background_thread(void *arg) -{ - unsigned long cpu = (unsigned long) arg; - unsigned long page_nr, start_nr, mid_nr, end_nr; - - start_nr = cpu * nr_pages_per_cpu; - end_nr = (cpu+1) * nr_pages_per_cpu; - mid_nr = (start_nr + end_nr) / 2; - - /* Copy the first half of the pages */ - for (page_nr = start_nr; page_nr < mid_nr; page_nr++) - copy_page_retry(uffd, page_nr * page_size); - - /* - * If we need to test uffd-wp, set it up now. 
Then we'll have
-	 * at least the first half of the pages mapped already which
-	 * can be write-protected for testing
-	 */
-	if (test_uffdio_wp)
-		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
-			nr_pages_per_cpu * page_size, true);
-
-	/*
-	 * Continue the 2nd half of the page copying, handling write
-	 * protection faults if any
-	 */
-	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
-		copy_page_retry(uffd, page_nr * page_size);
-
-	return NULL;
-}
-
-static int stress(struct uffd_stats *uffd_stats)
-{
-	unsigned long cpu;
-	pthread_t locking_threads[nr_cpus];
-	pthread_t uffd_threads[nr_cpus];
-	pthread_t background_threads[nr_cpus];
-
-	finished = 0;
-	for (cpu = 0; cpu < nr_cpus; cpu++) {
-		if (pthread_create(&locking_threads[cpu], &attr,
-				   locking_thread, (void *)cpu))
-			return 1;
-		if (bounces & BOUNCE_POLL) {
-			if (pthread_create(&uffd_threads[cpu], &attr,
-					   uffd_poll_thread,
-					   (void *)&uffd_stats[cpu]))
-				return 1;
-		} else {
-			if (pthread_create(&uffd_threads[cpu], &attr,
-					   uffd_read_thread,
-					   (void *)&uffd_stats[cpu]))
-				return 1;
-			pthread_mutex_lock(&uffd_read_mutex);
-		}
-		if (pthread_create(&background_threads[cpu], &attr,
-				   background_thread, (void *)cpu))
-			return 1;
-	}
-	for (cpu = 0; cpu < nr_cpus; cpu++)
-		if (pthread_join(background_threads[cpu], NULL))
-			return 1;
-
-	/*
-	 * Be strict and immediately zap area_src; the whole area has
-	 * been transferred already by the background threads. The
-	 * area_src could then be faulted in a racy way by still
-	 * running uffdio_threads reading zeropages after we zapped
-	 * area_src (but they're guaranteed to get -EEXIST from
-	 * UFFDIO_COPY without writing zero pages into area_dst
-	 * because the background threads already completed).
-	 */
-	uffd_test_ops->release_pages(area_src);
-
-	finished = 1;
-	for (cpu = 0; cpu < nr_cpus; cpu++)
-		if (pthread_join(locking_threads[cpu], NULL))
-			return 1;
-
-	for (cpu = 0; cpu < nr_cpus; cpu++) {
-		char c;
-		if (bounces & BOUNCE_POLL) {
-			if (write(pipefd[cpu*2+1], &c, 1) != 1)
-				err("pipefd write error");
-			if (pthread_join(uffd_threads[cpu],
-					 (void *)&uffd_stats[cpu]))
-				return 1;
-		} else {
-			if (pthread_cancel(uffd_threads[cpu]))
-				return 1;
-			if (pthread_join(uffd_threads[cpu], NULL))
-				return 1;
-		}
-	}
-
-	return 0;
-}
-
-sigjmp_buf jbuf, *sigbuf;
-
-static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
-{
-	if (sig == SIGBUS) {
-		if (sigbuf)
-			siglongjmp(*sigbuf, 1);
-		abort();
-	}
-}
-
-/*
- * For non-cooperative userfaultfd test we fork() a process that will
- * generate pagefaults, will mremap the area monitored by the
- * userfaultfd and at last this process will release the monitored
- * area.
- * For the anonymous and shared memory the area is divided into two
- * parts, the first part is accessed before mremap, and the second
- * part is accessed after mremap. Since hugetlbfs does not support
- * mremap, the entire monitored area is accessed in a single pass for
- * HUGETLB_TEST.
- * The release of the pages currently generates event for shmem and
- * anonymous memory (UFFD_EVENT_REMOVE), hence it is not checked
- * for hugetlb.
- * For the signal test (UFFD_FEATURE_SIGBUS), signal_test = 1: we
- * register the monitored area, generate pagefaults and test that the
- * signal is delivered. Use UFFDIO_COPY to allocate the missing page
- * and retry. For signal_test = 2 we test the robustness use case: we
- * release the monitored area, fork a process that will generate
- * pagefaults and verify the signal is generated.
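The SIGBUS machinery used by the signal test, in isolation (a sketch of the sighndl()/sigsetjmp() pattern below, with error handling omitted):

        #include <setjmp.h>
        #include <signal.h>

        static sigjmp_buf jb;

        static void on_sigbus(int sig, siginfo_t *si, void *uc)
        {
                siglongjmp(jb, 1);      /* back into the faulting loop */
        }

        /* install with: struct sigaction act = { .sa_sigaction = on_sigbus,
         *      .sa_flags = SA_SIGINFO }; sigaction(SIGBUS, &act, NULL);
         * then: if (sigsetjmp(jb, 1)) ... the access faulted ... */
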
- * This also tests UFFD_FEATURE_EVENT_FORK event along with the signal - * feature. Using monitor thread, verify no userfault events are generated. - */ -static int faulting_process(int signal_test) -{ - unsigned long nr; - unsigned long long count; - unsigned long split_nr_pages; - unsigned long lastnr; - struct sigaction act; - volatile unsigned long signalled = 0; - - split_nr_pages = (nr_pages + 1) / 2; - - if (signal_test) { - sigbuf = &jbuf; - memset(&act, 0, sizeof(act)); - act.sa_sigaction = sighndl; - act.sa_flags = SA_SIGINFO; - if (sigaction(SIGBUS, &act, 0)) - err("sigaction"); - lastnr = (unsigned long)-1; - } - - for (nr = 0; nr < split_nr_pages; nr++) { - volatile int steps = 1; - unsigned long offset = nr * page_size; - - if (signal_test) { - if (sigsetjmp(*sigbuf, 1) != 0) { - if (steps == 1 && nr == lastnr) - err("Signal repeated"); - - lastnr = nr; - if (signal_test == 1) { - if (steps == 1) { - /* This is a MISSING request */ - steps++; - if (copy_page(uffd, offset)) - signalled++; - } else { - /* This is a WP request */ - assert(steps == 2); - wp_range(uffd, - (__u64)area_dst + - offset, - page_size, false); - } - } else { - signalled++; - continue; - } - } - } - - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - /* - * Trigger write protection if there is by writing - * the same value back. - */ - *area_count(area_dst, nr) = count; - } - - if (signal_test) - return signalled != split_nr_pages; - - area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size, - MREMAP_MAYMOVE | MREMAP_FIXED, area_src); - if (area_dst == MAP_FAILED) - err("mremap"); - /* Reset area_src since we just clobbered it */ - area_src = NULL; - - for (; nr < nr_pages; nr++) { - count = *area_count(area_dst, nr); - if (count != count_verify[nr]) { - err("nr %lu memory corruption %llu %llu\n", - nr, count, count_verify[nr]); - } - /* - * Trigger write protection if there is by writing - * the same value back. 
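Standalone form of the same-value store used here: it changes no data, but if uffd-wp armed the page the write still faults and exercises the write-protect path:

        static inline void touch_page(volatile unsigned long long *p)
        {
                *p = *p;        /* read, then write the identical value back */
        }
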
- */ - *area_count(area_dst, nr) = count; - } - - uffd_test_ops->release_pages(area_dst); - - for (nr = 0; nr < nr_pages; nr++) - if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) - err("nr %lu is not zero", nr); - - return 0; -} - -static void retry_uffdio_zeropage(int ufd, - struct uffdio_zeropage *uffdio_zeropage, - unsigned long offset) -{ - uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start, - uffdio_zeropage->range.len, - offset); - if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) { - if (uffdio_zeropage->zeropage != -EEXIST) - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } else { - err("UFFDIO_ZEROPAGE error: %"PRId64, - (int64_t)uffdio_zeropage->zeropage); - } -} - -static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry) -{ - struct uffdio_zeropage uffdio_zeropage; - int ret; - bool has_zeropage = get_expected_ioctls(0) & (1 << _UFFDIO_ZEROPAGE); - __s64 res; - - if (offset >= nr_pages * page_size) - err("unexpected offset %lu", offset); - uffdio_zeropage.range.start = (unsigned long) area_dst + offset; - uffdio_zeropage.range.len = page_size; - uffdio_zeropage.mode = 0; - ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage); - res = uffdio_zeropage.zeropage; - if (ret) { - /* real retval in uffdio_zeropage.zeropage */ - if (has_zeropage) - err("UFFDIO_ZEROPAGE error: %"PRId64, (int64_t)res); - else if (res != -EINVAL) - err("UFFDIO_ZEROPAGE not -EINVAL"); - } else if (has_zeropage) { - if (res != page_size) { - err("UFFDIO_ZEROPAGE unexpected size"); - } else { - if (test_uffdio_zeropage_eexist && retry) { - test_uffdio_zeropage_eexist = false; - retry_uffdio_zeropage(ufd, &uffdio_zeropage, - offset); - } - return 1; - } - } else - err("UFFDIO_ZEROPAGE succeeded"); - - return 0; -} - -static int uffdio_zeropage(int ufd, unsigned long offset) -{ - return __uffdio_zeropage(ufd, offset, false); -} - -/* exercise UFFDIO_ZEROPAGE */ -static int userfaultfd_zeropage_test(void) -{ - struct uffdio_register uffdio_register; - - printf("testing UFFDIO_ZEROPAGE: "); - fflush(stdout); - - uffd_test_ctx_init(0); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (uffdio_zeropage(uffd, 0)) - if (my_bcmp(area_dst, zeropage, page_size)) - err("zeropage is not zero"); - - printf("done.\n"); - return 0; -} - -static int userfaultfd_events_test(void) -{ - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing events (fork, remap, remove): "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP | - UFFD_FEATURE_EVENT_REMOVE; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (pthread_create(&uffd_mon,
&attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(0)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - return stats.missing_faults != nr_pages; -} - -static int userfaultfd_sig_test(void) -{ - struct uffdio_register uffdio_register; - unsigned long userfaults; - pthread_t uffd_mon; - int err, features; - pid_t pid; - char c; - struct uffd_stats stats = { 0 }; - - printf("testing signal delivery: "); - fflush(stdout); - - features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS; - uffd_test_ctx_init(features); - - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (faulting_process(1)) - err("faulting process failed"); - - uffd_test_ops->release_pages(area_dst); - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - pid = fork(); - if (pid < 0) - err("fork"); - - if (!pid) - exit(faulting_process(2)); - - waitpid(pid, &err, 0); - if (err) - err("faulting process failed"); - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, (void **)&userfaults)) - return 1; - - printf("done.\n"); - if (userfaults) - err("Signal test failed, userfaults: %ld", userfaults); - - return userfaults != 0; -} - -void check_memory_contents(char *p) -{ - unsigned long i; - uint8_t expected_byte; - void *expected_page; - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (i = 0; i < nr_pages; ++i) { - expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, p + (i * page_size), page_size)) - err("unexpected page contents after minor fault"); - } - - free(expected_page); -} - -static int userfaultfd_minor_test(void) -{ - unsigned long p; - struct uffdio_register uffdio_register; - pthread_t uffd_mon; - char c; - struct uffd_stats stats = { 0 }; - - if (!test_uffdio_minor) - return 0; - - printf("testing minor faults: "); - fflush(stdout); - - uffd_test_ctx_init(uffd_minor_feature()); - - uffdio_register.range.start = (unsigned long)area_dst_alias; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - /* - * After registering with UFFD, populate the non-UFFD-registered side of - * the shared mapping. This should *not* trigger any UFFD minor faults. - */ - for (p = 0; p < nr_pages; ++p) { - memset(area_dst + (p * page_size), p % ((uint8_t)-1), - page_size); - } - - if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) - err("uffd_poll_thread create"); - - /* - * Read each of the pages back using the UFFD-registered mapping. 
We - * expect that the first time we touch a page, it will result in a minor - * fault. uffd_poll_thread will resolve the fault by bit-flipping the - * page's contents, and then issuing a CONTINUE ioctl. - */ - check_memory_contents(area_dst_alias); - - if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) - err("pipe write"); - if (pthread_join(uffd_mon, NULL)) - return 1; - - uffd_stats_report(&stats, 1); - - if (test_collapse) { - printf("testing collapse of uffd memory into PMD-mapped THPs:"); - if (madvise(area_dst_alias, nr_pages * page_size, - MADV_COLLAPSE)) - err("madvise(MADV_COLLAPSE)"); - - uffd_test_ops->check_pmd_mapping(area_dst, - nr_pages * page_size / - hpage_size); - /* - * This won't cause a uffd fault - it just makes sure there - * was no corruption. - */ - check_memory_contents(area_dst_alias); - printf(" done.\n"); - } - - return stats.missing_faults != 0 || stats.minor_faults != nr_pages; -} - -#define BIT_ULL(nr) (1ULL << (nr)) -#define PM_SOFT_DIRTY BIT_ULL(55) -#define PM_MMAP_EXCLUSIVE BIT_ULL(56) -#define PM_UFFD_WP BIT_ULL(57) -#define PM_FILE BIT_ULL(61) -#define PM_SWAP BIT_ULL(62) -#define PM_PRESENT BIT_ULL(63) - -static int pagemap_open(void) -{ - int fd = open("/proc/self/pagemap", O_RDONLY); - - if (fd < 0) - err("open pagemap"); - - return fd; -} - -static uint64_t pagemap_read_vaddr(int fd, void *vaddr) -{ - uint64_t value; - int ret; - - ret = pread(fd, &value, sizeof(uint64_t), - ((uint64_t)vaddr >> 12) * sizeof(uint64_t)); - if (ret != sizeof(uint64_t)) - err("pread() on pagemap failed"); - - return value; -} - -/* This macro lets __LINE__ work in err() */ -#define pagemap_check_wp(value, wp) do { \ - if (!!(value & PM_UFFD_WP) != wp) \ - err("pagemap uffd-wp bit error: 0x%"PRIx64, value); \ - } while (0) - -static int pagemap_test_fork(bool present) -{ - pid_t child = fork(); - uint64_t value; - int fd, result; - - if (!child) { - /* Open the pagemap fd of the child itself */ - fd = pagemap_open(); - value = pagemap_read_vaddr(fd, area_dst); - /* - * After fork() the uffd-wp bit should be gone as long as we're - * without UFFD_FEATURE_EVENT_FORK - */ - pagemap_check_wp(value, false); - /* Succeed */ - exit(0); - } - waitpid(child, &result, 0); - return result; -}
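(wp_range(), used throughout the write-protect tests here, is defined earlier in this file; it is essentially a thin wrapper around the UFFDIO_WRITEPROTECT ioctl. A minimal sketch of what such a wrapper looks like -- illustrative only, the helper name and error handling are placeholders, not part of this patch:)

    #include <err.h>
    #include <stdbool.h>
    #include <sys/ioctl.h>
    #include <linux/userfaultfd.h>

    static void wp_range_sketch(int uffd, __u64 start, __u64 len, bool wp)
    {
            struct uffdio_writeprotect prms;

            prms.range.start = start;
            prms.range.len = len;
            /* wp == true: install write protection; false: remove it */
            prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;

            if (ioctl(uffd, UFFDIO_WRITEPROTECT, &prms))
                    err(1, "ioctl(UFFDIO_WRITEPROTECT)");
    }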
 - -static void userfaultfd_pagemap_test(unsigned int test_pgsize) -{ - struct uffdio_register uffdio_register; - int pagemap_fd; - uint64_t value; - - /* Pagemap tests uffd-wp only */ - if (!test_uffdio_wp) - return; - - /* Not enough memory to test this page size */ - if (test_pgsize > nr_pages * page_size) - return; - - printf("testing uffd-wp with pagemap (pgsize=%u): ", test_pgsize); - /* Flush so it doesn't flush twice in parent/child later */ - fflush(stdout); - - uffd_test_ctx_init(0); - - if (test_pgsize > page_size) { - /* This is a thp test */ - if (madvise(area_dst, nr_pages * page_size, MADV_HUGEPAGE)) - err("madvise(MADV_HUGEPAGE) failed"); - } else if (test_pgsize == page_size) { - /* This is normal page test; force no thp */ - if (madvise(area_dst, nr_pages * page_size, MADV_NOHUGEPAGE)) - err("madvise(MADV_NOHUGEPAGE) failed"); - } - - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failed"); - - pagemap_fd = pagemap_open(); - - /* Touch the page */ - *area_dst = 1; - wp_range(uffd, (uint64_t)area_dst, test_pgsize, true); - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure the uffd-wp bit is dropped on fork */ - if (pagemap_test_fork(true)) - err("Detected stale uffd-wp bit in child"); - - /* Exclusive required or PAGEOUT won't work */ - if (!(value & PM_MMAP_EXCLUSIVE)) - err("multiple mapping detected: 0x%"PRIx64, value); - - if (madvise(area_dst, test_pgsize, MADV_PAGEOUT)) - err("madvise(MADV_PAGEOUT) failed"); - - /* The uffd-wp bit should persist even when swapped out */ - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, true); - /* Make sure the uffd-wp bit is dropped on fork */ - if (pagemap_test_fork(false)) - err("Detected stale uffd-wp bit in child"); - - /* Unprotect; this tests swap pte modifications */ - wp_range(uffd, (uint64_t)area_dst, page_size, false); - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - /* Fault in the page from disk */ - *area_dst = 2; - value = pagemap_read_vaddr(pagemap_fd, area_dst); - pagemap_check_wp(value, false); - - close(pagemap_fd); - printf("done\n"); -} - -static int userfaultfd_stress(void) -{ - void *area; - unsigned long nr; - struct uffdio_register uffdio_register; - struct uffd_stats uffd_stats[nr_cpus]; - - uffd_test_ctx_init(0); - - if (posix_memalign(&area, page_size, page_size)) - err("out of memory"); - zeropage = area; - bzero(zeropage, page_size); - - pthread_mutex_lock(&uffd_read_mutex); - - pthread_attr_init(&attr); - pthread_attr_setstacksize(&attr, 16*1024*1024); - - while (bounces--) { - printf("bounces: %d, mode:", bounces); - if (bounces & BOUNCE_RANDOM) - printf(" rnd"); - if (bounces & BOUNCE_RACINGFAULTS) - printf(" racing"); - if (bounces & BOUNCE_VERIFY) - printf(" ver"); - if (bounces & BOUNCE_POLL) - printf(" poll"); - else - printf(" read"); - printf(", "); - fflush(stdout); - - if (bounces & BOUNCE_POLL) - fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK); - else - fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK); - - /* register */ - uffdio_register.range.start = (unsigned long) area_dst; - uffdio_register.range.len = nr_pages * page_size; - uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; - if (test_uffdio_wp) - uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure"); - assert_expected_ioctls_present( - uffdio_register.mode, uffdio_register.ioctls); - - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) - area_dst_alias; - if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) - err("register failure alias"); - } - - /* - * The madvise done previously isn't enough: some - * uffd_thread could have read userfaults (one of - * those already resolved by the background thread) - * and it may be in the process of calling - * UFFDIO_COPY. UFFDIO_COPY will read the zapped - * area_src and it would map a zero page in it (of - * course such a UFFDIO_COPY is perfectly safe as it'd - * return -EEXIST). The problem comes at the next - * bounce though: that racing UFFDIO_COPY would - * generate zeropages in the area_src, invalidating - * the previous MADV_DONTNEED. Without this additional - * MADV_DONTNEED those zeropage leftovers in the - * area_src would lead to -EEXIST failures during the - * next bounce, effectively leaving a zeropage in the - * area_dst. - * - * Try commenting out this madvise to see the memory - * corruption being caught pretty quickly. - * - * khugepaged is also only inhibited from collapsing THPs - * by a MADV_DONTNEED issued after the UFFDIO_REGISTER, - * so the MADV_DONTNEED is required here.
- */ - uffd_test_ops->release_pages(area_dst); - - uffd_stats_reset(uffd_stats, nr_cpus); - - /* bounce pass */ - if (stress(uffd_stats)) - return 1; - - /* Clear all the write protections if there are any */ - if (test_uffdio_wp) - wp_range(uffd, (unsigned long)area_dst, - nr_pages * page_size, false); - - /* unregister */ - if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) - err("unregister failure"); - if (area_dst_alias) { - uffdio_register.range.start = (unsigned long) area_dst; - if (ioctl(uffd, UFFDIO_UNREGISTER, - &uffdio_register.range)) - err("unregister failure alias"); - } - - /* verification */ - if (bounces & BOUNCE_VERIFY) - for (nr = 0; nr < nr_pages; nr++) - if (*area_count(area_dst, nr) != count_verify[nr]) - err("error area_count %llu %llu %lu\n", - *area_count(area_src, nr), - count_verify[nr], nr); - - /* prepare next bounce */ - swap(area_src, area_dst); - - swap(area_src_alias, area_dst_alias); - - uffd_stats_report(uffd_stats, nr_cpus); - } - - if (test_type == TEST_ANON) { - /* - * shmem/hugetlb won't be able to run since they have different - * behavior on fork() (file-backed memory normally drops ptes - * directly on fork()), meanwhile the pagemap test will verify - * the pgtable entry of the fork()ed child. - */ - userfaultfd_pagemap_test(page_size); - /* - * Hard-code 2M THP for x86_64 for now, as x86_64 is - * currently the only arch that supports uffd-wp - */ - userfaultfd_pagemap_test(page_size * 512); - } - - return userfaultfd_zeropage_test() || userfaultfd_sig_test() - || userfaultfd_events_test() || userfaultfd_minor_test(); -} - -/* - * Copied from mlock2-tests.c - */ -unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -static void set_test_type(const char *type) -{ - if (!strcmp(type, "anon")) { - test_type = TEST_ANON; - uffd_test_ops = &anon_uffd_test_ops; - } else if (!strcmp(type, "hugetlb")) { - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - } else if (!strcmp(type, "hugetlb_shared")) { - map_shared = true; - test_type = TEST_HUGETLB; - uffd_test_ops = &hugetlb_uffd_test_ops; - /* Minor faults require shared hugetlb; only enable here.
*/ - test_uffdio_minor = true; - } else if (!strcmp(type, "shmem")) { - map_shared = true; - test_type = TEST_SHMEM; - uffd_test_ops = &shmem_uffd_test_ops; - test_uffdio_minor = true; - } -} - -static void parse_test_type_arg(const char *raw_type) -{ - char *buf = strdup(raw_type); - uint64_t features = UFFD_API_FEATURES; - - while (buf) { - const char *token = strsep(&buf, ":"); - - if (!test_type) - set_test_type(token); - else if (!strcmp(token, "dev")) - test_dev_userfaultfd = true; - else if (!strcmp(token, "syscall")) - test_dev_userfaultfd = false; - else if (!strcmp(token, "collapse")) - test_collapse = true; - else - err("unrecognized test mod '%s'", token); - } - - if (!test_type) - err("failed to parse test type argument: '%s'", raw_type); - - if (test_collapse && test_type != TEST_SHMEM) - err("Unsupported test: %s", raw_type); - - if (test_type == TEST_HUGETLB) - page_size = hpage_size; - else - page_size = sysconf(_SC_PAGE_SIZE); - - if (!page_size) - err("Unable to determine page size"); - if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2 - > page_size) - err("Impossible to run this test"); - - /* - * Whether we can test certain features depends not just on test type, - * but also on whether or not this particular kernel supports the - * feature. - */ - - userfaultfd_open(&features); - - test_uffdio_wp = test_uffdio_wp && - (features & UFFD_FEATURE_PAGEFAULT_FLAG_WP); - test_uffdio_minor = test_uffdio_minor && - (features & uffd_minor_feature()); - - close(uffd); - uffd = -1; -} - -static void sigalrm(int sig) -{ - if (sig != SIGALRM) - abort(); - test_uffdio_copy_eexist = true; - test_uffdio_zeropage_eexist = true; - alarm(ALARM_INTERVAL_SECS); -} - -int main(int argc, char **argv) -{ - size_t bytes; - - if (argc < 4) - usage(); - - if (signal(SIGALRM, sigalrm) == SIG_ERR) - err("failed to arm SIGALRM"); - alarm(ALARM_INTERVAL_SECS); - - hpage_size = default_huge_page_size(); - parse_test_type_arg(argv[1]); - bytes = atol(argv[2]) * 1024 * 1024; - - if (test_collapse && bytes & (hpage_size - 1)) - err("MiB must be multiple of %lu if :collapse mod set", - hpage_size >> 20); - - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - - if (test_collapse) { - /* nr_cpus must divide (bytes / page_size), otherwise, - * area allocations of (nr_pages * page_size) won't be a - * multiple of hpage_size, even if bytes is a multiple of - * hpage_size. - * - * This means that nr_cpus must divide (N * (1 << (H - P))) - * where: - * bytes = hpage_size * N - * hpage_size = 1 << H - * page_size = 1 << P - * - * And we want to choose nr_cpus to be the largest value - * satisfying this constraint, not larger than the number - * of online CPUs. Unfortunately, the prime factorizations - * of N and nr_cpus may be arbitrary, so we would have to - * search for it. Instead, just use the highest power of 2 - * dividing both nr_cpus and (bytes / page_size). - */ - int x = factor_of_2(nr_cpus); - int y = factor_of_2(bytes / page_size); - - nr_cpus = x < y ? x : y; - } - nr_pages_per_cpu = bytes / page_size / nr_cpus; - if (!nr_pages_per_cpu) { - _err("invalid MiB"); - usage(); - } - - bounces = atoi(argv[3]); - if (bounces <= 0) { - _err("invalid bounces"); - usage(); - } - nr_pages = nr_pages_per_cpu * nr_cpus; - - if (test_type == TEST_SHMEM || test_type == TEST_HUGETLB) { - unsigned int memfd_flags = 0; - - if (test_type == TEST_HUGETLB) - memfd_flags = MFD_HUGETLB; - mem_fd = memfd_create(argv[0], memfd_flags); - if (mem_fd < 0) - err("memfd_create"); - if (ftruncate(mem_fd, nr_pages * page_size * 2)) - err("ftruncate"); - if (fallocate(mem_fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, - nr_pages * page_size * 2)) - err("fallocate"); - } - printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n", - nr_pages, nr_pages_per_cpu); - return userfaultfd_stress(); -}
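(factor_of_2(), used by the collapse clamp in main() above, is defined earlier in this file and is not visible in this hunk. Assuming, as the comment implies, that it returns the largest power of two dividing its argument, a minimal sketch plus a worked example -- illustrative only, not part of this patch:)

    /* Sketch: the largest power-of-two divisor is the lowest set bit. */
    static unsigned long factor_of_2_sketch(unsigned long x)
    {
            return x & (~x + 1);    /* e.g. 12 (0b1100) -> 4 */
    }

    /*
     * Worked example: bytes = 1 GiB, page_size = 4 KiB, 12 online CPUs.
     * bytes / page_size = 262144 = 2^18, so y = 2^18 and x = 4;
     * nr_cpus is clamped to min(x, y) = 4, which divides 2^18 as required.
     */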
 - -#else /* __NR_userfaultfd */ - -#warning "missing __NR_userfaultfd definition" - -int main(void) -{ - printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n"); - return KSFT_SKIP; -} - -#endif /* __NR_userfaultfd */ --- a/tools/testing/selftests/vm/util.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ - -#ifndef __KSELFTEST_VM_UTIL_H -#define __KSELFTEST_VM_UTIL_H - -#include <stdint.h> -#include <sys/mman.h> -#include <err.h> -#include <string.h> /* ffsl() */ -#include <unistd.h> /* _SC_PAGESIZE */ - -static unsigned int __page_size; -static unsigned int __page_shift; - -static inline unsigned int page_size(void) -{ - if (!__page_size) - __page_size = sysconf(_SC_PAGESIZE); - return __page_size; -} - -static inline unsigned int page_shift(void) -{ - if (!__page_shift) - __page_shift = (ffsl(page_size()) - 1); - return __page_shift; -} - -#define PAGE_SHIFT (page_shift()) -#define PAGE_SIZE (page_size()) -/* - * On ppc64 this will only work with radix 2M hugepage size - */ -#define HPAGE_SHIFT 21 -#define HPAGE_SIZE (1 << HPAGE_SHIFT) - -#define PAGEMAP_PRESENT(ent) (((ent) & (1ull << 63)) != 0) -#define PAGEMAP_PFN(ent) ((ent) & ((1ull << 55) - 1)) - - -static inline int64_t allocate_transhuge(void *ptr, int pagemap_fd) -{ - uint64_t ent[2]; - - /* drop pmd */ - if (mmap(ptr, HPAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_FIXED | MAP_ANONYMOUS | - MAP_NORESERVE | MAP_PRIVATE, -1, 0) != ptr) - errx(2, "mmap transhuge"); - - if (madvise(ptr, HPAGE_SIZE, MADV_HUGEPAGE)) - err(2, "MADV_HUGEPAGE"); - - /* allocate transparent huge page */ - *(volatile void **)ptr = ptr; - - if (pread(pagemap_fd, ent, sizeof(ent), - (uintptr_t)ptr >> (PAGE_SHIFT - 3)) != sizeof(ent)) - err(2, "read pagemap"); - - if (PAGEMAP_PRESENT(ent[0]) && PAGEMAP_PRESENT(ent[1]) && - PAGEMAP_PFN(ent[0]) + 1 == PAGEMAP_PFN(ent[1]) && - !(PAGEMAP_PFN(ent[0]) & ((1 << (HPAGE_SHIFT - PAGE_SHIFT)) - 1))) - return PAGEMAP_PFN(ent[0]); - - return -1; -} - -#endif --- a/tools/testing/selftests/vm/va_128TBswitch.c +++ /dev/null @@ -1,289 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * - * Authors: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> - * Authors: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxxxxxxx> - */ - -#include <stdio.h> -#include <sys/mman.h> -#include <string.h> - -#include "../kselftest.h" - -#ifdef __powerpc64__ -#define PAGE_SIZE (64 << 10) -/* - * This will work with 16M and 2M hugepage size - */ -#define HUGETLB_SIZE (16 << 20) -#else -#define PAGE_SIZE (4 << 10) -#define HUGETLB_SIZE (2 << 20) -#endif - -/* - * A hint addr value >= 128TB is used to select - * the large address space.
 - */ -#define ADDR_SWITCH_HINT (1UL << 47) -#define LOW_ADDR ((void *) (1UL << 30)) -#define HIGH_ADDR ((void *) (1UL << 48)) - -struct testcase { - void *addr; - unsigned long size; - unsigned long flags; - const char *msg; - unsigned int low_addr_required:1; - unsigned int keep_mapped:1; -}; - -static struct testcase testcases[] = { - { - /* - * If stack is moved, we could possibly allocate - * this at the requested address. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - /* - * We should never allocate at the requested address or above it. - * The length crosses the 128TB boundary. Without MAP_FIXED - * we will always search in the lower address space. - */ - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", - .low_addr_required = 1, - }, - { - /* - * Exact mapping at 128TB; since the area is free we should - * get it even without MAP_FIXED. - */ - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, - { - .addr = NULL, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED)", - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1) again", - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", - .low_addr_required = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = ((void *)(ADDR_SWITCH_HINT)), - .size = PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", - }, -}; - -static struct testcase
hugetlb_testcases[] = { - { - .addr = NULL, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = (void *) -1, - .size = HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB) again", - }, - { - .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)(ADDR_SWITCH_HINT), - .size = 2 * HUGETLB_SIZE, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", - }, -}; - -static int run_test(struct testcase *test, int count) -{ - void *p; - int i, ret = KSFT_PASS; - - for (i = 0; i < count; i++) { - struct testcase *t = test + i; - - p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); - - printf("%s: %p - ", t->msg, p); - - if (p == MAP_FAILED) { - printf("FAILED\n"); - ret = KSFT_FAIL; - continue; - } - - if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { - printf("FAILED\n"); - ret = KSFT_FAIL; - } else { - /* - * Do a dereference of the address returned so that we catch - * bugs in page fault handling - */ - memset(p, 0, t->size); - printf("OK\n"); - } - if (!t->keep_mapped) - munmap(p, t->size); - } - - return ret; -} - -static int supported_arch(void) -{ -#if defined(__powerpc64__) - return 1; -#elif defined(__x86_64__) - return 1; -#else - return 0; -#endif -} - -int main(int argc, char **argv) -{ - int ret; - - if (!supported_arch()) - return KSFT_SKIP; - - ret = run_test(testcases, ARRAY_SIZE(testcases)); - if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) - ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); - return ret; -} --- a/tools/testing/selftests/vm/va_128TBswitch.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0 -# -# Copyright (C) 2022 Adam Sindelar (Meta) <adam@xxxxxxxxxxxx> -# -# This is a test for mmap behavior with 5-level paging. This script wraps the -# real test to check that the kernel is configured to support at least 5 -# pagetable levels. - -# 1 means the test failed -exitcode=1 - -# Kselftest framework requirement - SKIP code is 4. 
-ksft_skip=4 - -fail() -{ - echo "$1" - exit $exitcode -} - -check_supported_x86_64() -{ - local config="/proc/config.gz" - [[ -f "${config}" ]] || config="/boot/config-$(uname -r)" - [[ -f "${config}" ]] || fail "Cannot find kernel config in /proc or /boot" - - # gzip -dcfq automatically handles both compressed and plaintext input. - # See man 1 gzip under '-f'. - local pg_table_levels=$(gzip -dcfq "${config}" | grep PGTABLE_LEVELS | cut -d'=' -f 2) - - if [[ "${pg_table_levels}" -lt 5 ]]; then - echo "$0: PGTABLE_LEVELS=${pg_table_levels}, must be >= 5 to run this test" - exit $ksft_skip - fi -} - -check_test_requirements() -{ - # The test supports x86_64 and powerpc64. We currently have no useful - # eligibility check for powerpc64, and the test itself will reject other - # architectures. - case `uname -m` in - "x86_64") - check_supported_x86_64 - ;; - *) - return 0 - ;; - esac -} - -check_test_requirements -./va_128TBswitch --- a/tools/testing/selftests/vm/virtual_address_range.c +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * Copyright 2017, Anshuman Khandual, IBM Corp. - * - * Works on architectures which support 128TB virtual - * address range and beyond. - */ -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <errno.h> -#include <sys/mman.h> -#include <sys/time.h> - -/* - * The maximum address range mapped with a single mmap() - * call is a little more than 16GB. Hence 16GB is - * chosen as the single chunk size for address space - * mapping. - */ -#define MAP_CHUNK_SIZE 17179869184UL /* 16GB */ - -/* - * Address space up to 128TB is mapped without any hint - * and is enabled by default. Address space beyond 128TB - * up to 512TB is obtained by passing a hint address as the - * first argument to the mmap() system call. - * - * The process address space is thus divided into two - * areas: one below 128TB and one from 128TB up to - * 512TB, i.e. one of size 128TB and the - * other of 384TB. - * - * On Arm64 the address space is 256TB and no high mappings - * are supported so far. - */
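(Quick sanity check on the constants that follow, for the reader: one 16GB chunk is 2^34 bytes, so covering 128TB = 2^47 bytes takes 2^47 / 2^34 = 2^13 = 8192 chunks, which is NR_CHUNKS_128TB below; the 384TB high region correspondingly needs 3 * 8192 chunks.)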
 - -#define NR_CHUNKS_128TB 8192UL /* Number of 16GB chunks for 128TB */ -#define NR_CHUNKS_256TB (NR_CHUNKS_128TB * 2UL) -#define NR_CHUNKS_384TB (NR_CHUNKS_128TB * 3UL) - -#define ADDR_MARK_128TB (1UL << 47) /* First address beyond 128TB */ -#define ADDR_MARK_256TB (1UL << 48) /* First address beyond 256TB */ - -#ifdef __aarch64__ -#define HIGH_ADDR_MARK ADDR_MARK_256TB -#define HIGH_ADDR_SHIFT 49 -#define NR_CHUNKS_LOW NR_CHUNKS_256TB -#define NR_CHUNKS_HIGH 0 -#else -#define HIGH_ADDR_MARK ADDR_MARK_128TB -#define HIGH_ADDR_SHIFT 48 -#define NR_CHUNKS_LOW NR_CHUNKS_128TB -#define NR_CHUNKS_HIGH NR_CHUNKS_384TB -#endif - -static char *hind_addr(void) -{ - int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); - - return (char *) (1UL << bits); -} - -static int validate_addr(char *ptr, int high_addr) -{ - unsigned long addr = (unsigned long) ptr; - - if (high_addr) { - if (addr < HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; - } - - if (addr > HIGH_ADDR_MARK) { - printf("Bad address %lx\n", addr); - return 1; - } - return 0; -} - -static int validate_lower_address_hint(void) -{ - char *ptr; - - ptr = mmap((void *) (1UL << 45), MAP_CHUNK_SIZE, PROT_READ | - PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr == MAP_FAILED) - return 0; - - return 1; -} - -int main(int argc, char *argv[]) -{ - char *ptr[NR_CHUNKS_LOW]; - char *hptr[NR_CHUNKS_HIGH]; - char *hint; - unsigned long i, lchunks, hchunks; - - for (i = 0; i < NR_CHUNKS_LOW; i++) { - ptr[i] = mmap(NULL, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (ptr[i] == MAP_FAILED) { - if (validate_lower_address_hint()) - return 1; - break; - } - - if (validate_addr(ptr[i], 0)) - return 1; - } - lchunks = i; - - for (i = 0; i < NR_CHUNKS_HIGH; i++) { - hint = hind_addr(); - hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - - if (hptr[i] == MAP_FAILED) - break; - - if (validate_addr(hptr[i], 1)) - return 1; - } - hchunks = i; - - for (i = 0; i < lchunks; i++) - munmap(ptr[i], MAP_CHUNK_SIZE); - - for (i = 0; i < hchunks; i++) - munmap(hptr[i], MAP_CHUNK_SIZE); - - return 0; -} --- a/tools/testing/selftests/vm/vm_util.c +++ /dev/null @@ -1,151 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <string.h> -#include <fcntl.h> -#include "../kselftest.h" -#include "vm_util.h" - -#define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" -#define SMAP_FILE_PATH "/proc/self/smaps" -#define MAX_LINE_LENGTH 500 - -uint64_t pagemap_get_entry(int fd, char *start) -{ - const unsigned long pfn = (unsigned long)start / getpagesize(); - uint64_t entry; - int ret; - - ret = pread(fd, &entry, sizeof(entry), pfn * sizeof(entry)); - if (ret != sizeof(entry)) - ksft_exit_fail_msg("reading pagemap failed\n"); - return entry; -} - -bool pagemap_is_softdirty(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - // Check if the soft-dirty bit (bit 55) is set - return entry & 0x0080000000000000ull; -} - -bool pagemap_is_swapped(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - return entry & 0x4000000000000000ull; -} - -bool pagemap_is_populated(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - /* Present or swapped. */ - return entry & 0xc000000000000000ull; -} - -unsigned long pagemap_get_pfn(int fd, char *start) -{ - uint64_t entry = pagemap_get_entry(fd, start); - - /* If present (bit 63), the PFN is in bits 0--54.
*/ - if (entry & 0x8000000000000000ull) - return entry & 0x007fffffffffffffull; - return -1ul; -} - -void clear_softdirty(void) -{ - int ret; - const char *ctrl = "4"; - int fd = open("/proc/self/clear_refs", O_WRONLY); - - if (fd < 0) - ksft_exit_fail_msg("opening clear_refs failed\n"); - ret = write(fd, ctrl, strlen(ctrl)); - close(fd); - if (ret != strlen(ctrl)) - ksft_exit_fail_msg("writing clear_refs failed\n"); -} - -bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) -{ - while (fgets(buf, len, fp)) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; - } - return false; -} - -uint64_t read_pmd_pagesize(void) -{ - int fd; - char buf[20]; - ssize_t num_read; - - fd = open(PMD_SIZE_FILE_PATH, O_RDONLY); - if (fd == -1) - ksft_exit_fail_msg("Open hpage_pmd_size failed\n"); - - num_read = read(fd, buf, 19); - if (num_read < 1) { - close(fd); - ksft_exit_fail_msg("Read hpage_pmd_size failed\n"); - } - buf[num_read] = '\0'; - close(fd); - - return strtoul(buf, NULL, 10); -} - -bool __check_huge(void *addr, char *pattern, int nr_hpages, - uint64_t hpage_size) -{ - uint64_t thp = -1; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) - ksft_exit_fail_msg("%s: Pattern is too long\n", __func__); - - fp = fopen(SMAP_FILE_PATH, "r"); - if (!fp) - ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); - - if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) - goto err_out; - - /* - * Fetch the pattern in the same block and check the number of - * hugepages. - */ - if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) - goto err_out; - - snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); - - if (sscanf(buffer, addr_pattern, &thp) != 1) - ksft_exit_fail_msg("Reading smap error\n"); - -err_out: - fclose(fp); - return thp == (nr_hpages * (hpage_size >> 10)); -} - -bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); -} - -bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); -} - -bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) -{ - return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); -} --- a/tools/testing/selftests/vm/vm_util.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#include <stdint.h> -#include <stdbool.h> - -uint64_t pagemap_get_entry(int fd, char *start); -bool pagemap_is_softdirty(int fd, char *start); -bool pagemap_is_swapped(int fd, char *start); -bool pagemap_is_populated(int fd, char *start); -unsigned long pagemap_get_pfn(int fd, char *start); -void clear_softdirty(void); -bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); -uint64_t read_pmd_pagesize(void); -bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); -bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); -bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); --- a/tools/testing/selftests/vm/write_hugetlb_memory.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0 - -set -e - -size=$1 -populate=$2 -write=$3 -cgroup=$4 -path=$5 -method=$6 -private=$7 -want_sleep=$8 -reserve=$9 - -echo "Putting task in cgroup '$cgroup'" -echo $$ > 
${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs - -echo "Method is $method" - -set +e -./write_to_hugetlbfs -p "$path" -s "$size" "$write" "$populate" -m "$method" \ - "$private" "$want_sleep" "$reserve" --- a/tools/testing/selftests/vm/write_to_hugetlbfs.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * This program reserves and uses hugetlb memory, supporting a bunch of - * scenarios needed by the charge_reserved_hugetlb.sh test. - */ - -#include <err.h> -#include <errno.h> -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <unistd.h> -#include <fcntl.h> -#include <sys/types.h> -#include <sys/shm.h> -#include <sys/stat.h> -#include <sys/mman.h> - -/* Global definitions. */ -enum method { - HUGETLBFS, - MMAP_MAP_HUGETLB, - SHM, - MAX_METHOD -}; - - -/* Global variables. */ -static const char *self; -static char *shmaddr; -static int shmid; - -/* - * Show usage and exit. - */ -static void exit_usage(void) -{ - printf("Usage: %s -p <path to hugetlbfs file> -s <size to map> " - "[-m <0=hugetlbfs | 1=mmap(MAP_HUGETLB)>] [-l] [-r] " - "[-o] [-w] [-n]\n", - self); - exit(EXIT_FAILURE); -} - -void sig_handler(int signo) -{ - printf("Received %d.\n", signo); - if (signo == SIGINT) { - printf("Deleting the memory\n"); - if (shmdt((const void *)shmaddr) != 0) { - perror("Detach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(4); - } - - shmctl(shmid, IPC_RMID, NULL); - printf("Done deleting the memory\n"); - } - exit(2); -} - -int main(int argc, char **argv) -{ - int fd = 0; - int key = 0; - int *ptr = NULL; - int c = 0; - int size = 0; - char path[256] = ""; - enum method method = MAX_METHOD; - int want_sleep = 0, private = 0; - int populate = 0; - int write = 0; - int reserve = 1; - - if (signal(SIGINT, sig_handler) == SIG_ERR) - err(1, "\ncan't catch SIGINT\n"); - - /* Parse command-line arguments.
*/ - setvbuf(stdout, NULL, _IONBF, 0); - self = argv[0]; - - while ((c = getopt(argc, argv, "s:p:m:owlrn")) != -1) { - switch (c) { - case 's': - size = atoi(optarg); - break; - case 'p': - strncpy(path, optarg, sizeof(path)); - break; - case 'm': - if (atoi(optarg) >= MAX_METHOD) { - errno = EINVAL; - perror("Invalid -m."); - exit_usage(); - } - method = atoi(optarg); - break; - case 'o': - populate = 1; - break; - case 'w': - write = 1; - break; - case 'l': - want_sleep = 1; - break; - case 'r': - private - = 1; - break; - case 'n': - reserve = 0; - break; - default: - errno = EINVAL; - perror("Invalid arg"); - exit_usage(); - } - } - - if (strncmp(path, "", sizeof(path)) != 0) { - printf("Writing to this path: %s\n", path); - } else { - errno = EINVAL; - perror("path not found"); - exit_usage(); - } - - if (size != 0) { - printf("Writing this size: %d\n", size); - } else { - errno = EINVAL; - perror("size not found"); - exit_usage(); - } - - if (!populate) - printf("Not populating.\n"); - else - printf("Populating.\n"); - - if (!write) - printf("Not writing to memory.\n"); - - if (method == MAX_METHOD) { - errno = EINVAL; - perror("-m Invalid"); - exit_usage(); - } else - printf("Using method=%d\n", method); - - if (!private) - printf("Shared mapping.\n"); - else - printf("Private mapping.\n"); - - if (!reserve) - printf("NO_RESERVE mapping.\n"); - else - printf("RESERVE mapping.\n"); - - switch (method) { - case HUGETLBFS: - printf("Allocating using HUGETLBFS.\n"); - fd = open(path, O_CREAT | O_RDWR, 0777); - if (fd == -1) - err(1, "Failed to open file."); - - ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - (private ? MAP_PRIVATE : MAP_SHARED) | - (populate ? MAP_POPULATE : 0) | - (reserve ? 0 : MAP_NORESERVE), - fd, 0); - - if (ptr == MAP_FAILED) { - close(fd); - err(1, "Error mapping the file"); - } - break; - case MMAP_MAP_HUGETLB: - printf("Allocating using MAP_HUGETLB.\n"); - ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, - (private ? (MAP_PRIVATE | MAP_ANONYMOUS) : - MAP_SHARED) | - MAP_HUGETLB | (populate ? MAP_POPULATE : 0) | - (reserve ? 0 : MAP_NORESERVE), - -1, 0); - - if (ptr == MAP_FAILED) - err(1, "mmap"); - - printf("Returned address is %p\n", ptr); - break; - case SHM: - printf("Allocating using SHM.\n"); - shmid = shmget(key, size, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) { - shmid = shmget(++key, size, - SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); - if (shmid < 0) - err(1, "shmget"); - } - printf("shmid: 0x%x, shmget key:%d\n", shmid, key); - - ptr = shmat(shmid, NULL, 0); - if (ptr == (int *)-1) { - perror("Shared memory attach failure"); - shmctl(shmid, IPC_RMID, NULL); - exit(2); - } - printf("shmaddr: %p\n", ptr); - - break; - default: - errno = EINVAL; - err(1, "Invalid method."); - } - - if (write) { - printf("Writing to memory.\n"); - memset(ptr, 1, size); - } - - if (want_sleep) { - /* Signal to caller that we're done. */ - printf("DONE\n"); - - /* Hold memory until external kill signal is delivered. 
*/ - while (1) - sleep(100); - } - - if (method == HUGETLBFS) - close(fd); - - return 0; -} _ Patches currently in -mm which might be from sj@xxxxxxxxxx are mm-damon-core-implement-damos-filter.patch mm-damon-paddr-support-damos-filters.patch mm-damon-reclaim-add-a-parameter-called-skip_anon-for-avoiding-anonymous-pages-reclamation.patch docs-admin-guide-damon-reclaim-document-skip_anon-parameter.patch mm-damon-sysfs-schemes-implement-filters-directory.patch mm-damon-sysfs-schemes-implement-filter-directory.patch mm-damon-sysfs-schemes-connect-filter-directory-and-filters-directory.patch mm-damon-sysfs-schemes-implement-scheme-filters.patch mm-damon-sysfs-schemes-implement-scheme-filters-fix.patch mm-damon-sysfs-schemes-implement-scheme-filters-fix-fix-2.patch selftests-damon-sysfs-test-filters-directory.patch docs-admin-guide-mm-damon-usage-document-damos-filters-of-sysfs.patch docs-abi-damon-document-scheme-filters-files.patch mm-page_reporting-replace-rcu_access_pointer-with-rcu_dereference_protected.patch maintainers-add-types-to-akpm-mm-git-trees-entries.patch maintainers-memory-management-add-tools-vm-as-managed-files.patch tools-vm-rename-tools-vm-to-tools-mm.patch selftests-vm-rename-selftets-vm-to-selftests-mm.patch docs-admin-guide-mm-numaperf-increase-depth-of-subsections.patch scripts-spelling-add-a-few-more-typos.patch