This is an automated email from the git hooks/post-receive script. jforbes pushed a commit to branch master in repository kernel-tests. commit f76b8121a1a5691d38d525f3b9652506ed1e8090 Author: Josh Boyer <jwboyer@xxxxxxxxxx> Date: Tue Jun 12 15:46:54 2012 -0400 Add libhugetlbfs tests --- default/libhugetlbfs/libhugetlbfs/.gitignore | 5 + default/libhugetlbfs/libhugetlbfs/HOWTO | 765 +++++++++ default/libhugetlbfs/libhugetlbfs/LGPL-2.1 | 510 ++++++ default/libhugetlbfs/libhugetlbfs/Makefile | 437 +++++ default/libhugetlbfs/libhugetlbfs/NEWS | 357 ++++ default/libhugetlbfs/libhugetlbfs/README | 42 + .../libhugetlbfs/libhugetlbfs/TLBC/DataCollect.pm | 55 + .../libhugetlbfs/libhugetlbfs/TLBC/OpCollect.pm | 185 +++ .../libhugetlbfs/libhugetlbfs/TLBC/PerfCollect.pm | 144 ++ default/libhugetlbfs/libhugetlbfs/TLBC/Report.pm | 58 + default/libhugetlbfs/libhugetlbfs/alloc.c | 337 ++++ .../libhugetlbfs/contrib/tlbmiss_cost.sh | 693 ++++++++ default/libhugetlbfs/libhugetlbfs/cpupcstat | 337 ++++ default/libhugetlbfs/libhugetlbfs/debug.c | 50 + default/libhugetlbfs/libhugetlbfs/elf32ppclinux.c | 54 + default/libhugetlbfs/libhugetlbfs/elf64ppc.c | 54 + default/libhugetlbfs/libhugetlbfs/elflink.c | 1333 +++++++++++++++ .../libhugetlbfs/huge_page_setup_helper.py | 343 ++++ default/libhugetlbfs/libhugetlbfs/hugeadm.c | 1699 ++++++++++++++++++++ default/libhugetlbfs/libhugetlbfs/hugectl.c | 488 ++++++ default/libhugetlbfs/libhugetlbfs/hugeedit.c | 240 +++ default/libhugetlbfs/libhugetlbfs/hugetlbfs.h | 79 + default/libhugetlbfs/libhugetlbfs/hugeutils.c | 1184 ++++++++++++++ default/libhugetlbfs/libhugetlbfs/init.c | 39 + default/libhugetlbfs/libhugetlbfs/init_privutils.c | 27 + .../libhugetlbfs/libhugetlbfs/kernel-features.c | 271 ++++ .../libhugetlbfs/libhugetlbfs/kernel-features.h | 30 + default/libhugetlbfs/libhugetlbfs/ld.hugetlbfs | 84 + .../libhugetlbfs/ldscripts/elf32ppclinux.xB | 254 +++ .../libhugetlbfs/ldscripts/elf32ppclinux.xBDT | 245 +++ 
.../libhugetlbfs/ldscripts/elf64ppc.xB | 245 +++ .../libhugetlbfs/ldscripts/elf64ppc.xBDT | 241 +++ .../libhugetlbfs/ldscripts/elf_i386.xB | 200 +++ .../libhugetlbfs/ldscripts/elf_i386.xBDT | 198 +++ .../libhugetlbfs/ldscripts/elf_x86_64.xB | 202 +++ .../libhugetlbfs/ldscripts/elf_x86_64.xBDT | 202 +++ .../libhugetlbfs/libhugetlbfs/libhugetlbfs_debug.h | 42 + .../libhugetlbfs/libhugetlbfs_internal.h | 210 +++ .../libhugetlbfs/libhugetlbfs_privutils.h | 94 ++ .../libhugetlbfs/libhugetlbfs_testprobes.h | 39 + default/libhugetlbfs/libhugetlbfs/localversion | 90 ++ default/libhugetlbfs/libhugetlbfs/man/cpupcstat.8 | 117 ++ .../libhugetlbfs/libhugetlbfs/man/get_huge_pages.3 | 73 + .../libhugetlbfs/man/get_hugepage_region.3 | 88 + .../libhugetlbfs/man/gethugepagesizes.3 | 69 + .../libhugetlbfs/libhugetlbfs/man/getpagesizes.3 | 70 + default/libhugetlbfs/libhugetlbfs/man/hugeadm.8 | 294 ++++ default/libhugetlbfs/libhugetlbfs/man/hugectl.8 | 141 ++ default/libhugetlbfs/libhugetlbfs/man/hugeedit.8 | 57 + .../libhugetlbfs/libhugetlbfs/man/libhugetlbfs.7 | 212 +++ default/libhugetlbfs/libhugetlbfs/man/pagesize.1 | 57 + .../libhugetlbfs/man/tlbmiss_cost.sh.8 | 85 + default/libhugetlbfs/libhugetlbfs/mktarball | 32 + default/libhugetlbfs/libhugetlbfs/morecore.c | 366 +++++ .../libhugetlbfs/oprofile_map_events.pl | 146 ++ .../libhugetlbfs/libhugetlbfs/oprofile_start.sh | 85 + default/libhugetlbfs/libhugetlbfs/pagesize.c | 140 ++ default/libhugetlbfs/libhugetlbfs/privutils.lds | 6 + default/libhugetlbfs/libhugetlbfs/shm.c | 143 ++ .../libhugetlbfs/libhugetlbfs/sys-elf32ppclinux.S | 34 + default/libhugetlbfs/libhugetlbfs/sys-elf64ppc.S | 43 + default/libhugetlbfs/libhugetlbfs/sys-elf_i386.S | 42 + default/libhugetlbfs/libhugetlbfs/sys-elf_x86_64.S | 34 + default/libhugetlbfs/libhugetlbfs/tests/.gitignore | 4 + default/libhugetlbfs/libhugetlbfs/tests/Makefile | 284 ++++ .../libhugetlbfs/tests/alloc-instantiate-race.c | 274 ++++ .../libhugetlbfs/tests/bad-toolchain.sh | 5 + 
.../libhugetlbfs/tests/brk_near_huge.c | 114 ++ .../libhugetlbfs/tests/chunk-overcommit.c | 114 ++ .../libhugetlbfs/tests/compare_kvers.c | 41 + default/libhugetlbfs/libhugetlbfs/tests/counters.c | 414 +++++ .../libhugetlbfs/libhugetlbfs/tests/counters.sh | 13 + default/libhugetlbfs/libhugetlbfs/tests/direct.c | 101 ++ default/libhugetlbfs/libhugetlbfs/tests/dummy.c | 31 + .../libhugetlbfs/libhugetlbfs/tests/empty_mounts.c | 69 + .../libhugetlbfs/tests/fadvise_reserve.c | 86 + .../libhugetlbfs/tests/fadvise_reserve.sh | 14 + .../libhugetlbfs/libhugetlbfs/tests/find_path.c | 44 + default/libhugetlbfs/libhugetlbfs/tests/fork-cow.c | 176 ++ .../libhugetlbfs/tests/get_huge_pages.c | 76 + .../libhugetlbfs/tests/get_hugepage_region.c | 137 ++ .../libhugetlbfs/tests/get_hugetlbfs_path.c | 40 + .../libhugetlbfs/tests/gethugepagesize.c | 44 + .../libhugetlbfs/tests/gethugepagesizes.c | 412 +++++ .../libhugetlbfs/tests/heap-overflow.c | 110 ++ .../libhugetlbfs/tests/heapshrink-helper.c | 25 + .../libhugetlbfs/libhugetlbfs/tests/heapshrink.c | 74 + .../libhugetlbfs/tests/huge_at_4GB_normal_below.c | 94 ++ .../tests/huge_below_4GB_normal_above.c | 117 ++ .../libhugetlbfs/libhugetlbfs/tests/hugetests.h | 142 ++ .../libhugetlbfs/tests/icache-hygiene.c | 215 +++ .../libhugetlbfs/libhugetlbfs/tests/large_mounts.c | 117 ++ .../libhugetlbfs/libhugetlbfs/tests/libtestutils.c | 138 ++ default/libhugetlbfs/libhugetlbfs/tests/linkhuge.c | 176 ++ .../libhugetlbfs/tests/linkhuge_nofd.c | 42 + .../libhugetlbfs/libhugetlbfs/tests/linkhuge_rw.c | 210 +++ .../libhugetlbfs/libhugetlbfs/tests/linkshare.c | 373 +++++ .../libhugetlbfs/tests/madvise_reserve.c | 81 + .../libhugetlbfs/tests/madvise_reserve.sh | 14 + default/libhugetlbfs/libhugetlbfs/tests/malloc.c | 87 + .../libhugetlbfs/tests/malloc_manysmall.c | 76 + .../libhugetlbfs/tests/map_high_truncate_2.c | 100 ++ .../libhugetlbfs/tests/meminfo_nohuge.c | 79 + default/libhugetlbfs/libhugetlbfs/tests/misalign.c | 121 ++ 
.../libhugetlbfs/tests/misaligned_offset.c | 140 ++ default/libhugetlbfs/libhugetlbfs/tests/mlock.c | 72 + default/libhugetlbfs/libhugetlbfs/tests/mmap-cow.c | 182 +++ .../libhugetlbfs/libhugetlbfs/tests/mmap-gettest.c | 127 ++ default/libhugetlbfs/libhugetlbfs/tests/mprotect.c | 217 +++ .../tests/mremap-expand-slice-collision.c | 188 +++ .../tests/mremap-expand-slice-collision.sh | 14 + .../tests/mremap-fixed-huge-near-normal.c | 145 ++ .../tests/mremap-fixed-huge-near-normal.sh | 14 + .../tests/mremap-fixed-normal-near-huge.c | 124 ++ .../tests/mremap-fixed-normal-near-huge.sh | 14 + default/libhugetlbfs/libhugetlbfs/tests/private.c | 92 ++ .../libhugetlbfs/tests/ptrace-write-hugepage.c | 162 ++ default/libhugetlbfs/libhugetlbfs/tests/quota.c | 271 ++++ default/libhugetlbfs/libhugetlbfs/tests/quota.sh | 13 + .../libhugetlbfs/tests/readahead_reserve.c | 86 + .../libhugetlbfs/tests/readahead_reserve.sh | 14 + default/libhugetlbfs/libhugetlbfs/tests/readback.c | 64 + .../libhugetlbfs/libhugetlbfs/tests/run_tests.py | 704 ++++++++ default/libhugetlbfs/libhugetlbfs/tests/shared.c | 71 + default/libhugetlbfs/libhugetlbfs/tests/shm-fork.c | 136 ++ .../libhugetlbfs/libhugetlbfs/tests/shm-getraw.c | 106 ++ .../libhugetlbfs/libhugetlbfs/tests/shm-gettest.c | 110 ++ .../libhugetlbfs/libhugetlbfs/tests/shm-perms.c | 131 ++ .../libhugetlbfs/tests/shmoverride_unlinked.c | 248 +++ .../libhugetlbfs/libhugetlbfs/tests/slbpacaflush.c | 96 ++ .../libhugetlbfs/tests/stack_grow_into_huge.c | 140 ++ .../libhugetlbfs/libhugetlbfs/tests/straddle_4GB.c | 108 ++ .../libhugetlbfs/tests/task-size-overrun.c | 131 ++ .../libhugetlbfs/libhugetlbfs/tests/test_root.c | 39 + .../libhugetlbfs/libhugetlbfs/tests/testutils.c | 298 ++++ default/libhugetlbfs/libhugetlbfs/tests/truncate.c | 79 + .../libhugetlbfs/tests/truncate_above_4GB.c | 157 ++ .../tests/truncate_reserve_wraparound.c | 130 ++ .../tests/truncate_sigbus_versus_oom.c | 100 ++ .../libhugetlbfs/libhugetlbfs/tests/unlinked_fd.c | 60 + 
.../libhugetlbfs/tests/wrapper-utils.sh | 56 + .../libhugetlbfs/tests/zero_filesize_segment.c | 60 + .../libhugetlbfs/tests/zero_filesize_segment.ld | 7 + default/libhugetlbfs/libhugetlbfs/version.c | 3 + default/libhugetlbfs/libhugetlbfs/version.lds | 28 + default/libhugetlbfs/runtest.sh | 49 + 146 files changed, 24366 insertions(+) diff --git a/default/libhugetlbfs/libhugetlbfs/.gitignore b/default/libhugetlbfs/libhugetlbfs/.gitignore new file mode 100644 index 0000000..4b147f1 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/.gitignore @@ -0,0 +1,5 @@ +*.d +*.o +*.a +*.so +*~ diff --git a/default/libhugetlbfs/libhugetlbfs/HOWTO b/default/libhugetlbfs/libhugetlbfs/HOWTO new file mode 100644 index 0000000..8db958d --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/HOWTO @@ -0,0 +1,765 @@ +libhugetlbfs HOWTO +================== + +Author: David Gibson <dwg@xxxxxxxxxxx>, Adam Litke <agl@xxxxxxxxxx>, and others +Last updated: December 07, 2011 + +Introduction +============ + +In Linux(TM), access to hugepages is provided through a virtual file +system, "hugetlbfs". The libhugetlbfs library interface works with +hugetlbfs to provide more convenient specific application-level +services. In particular libhugetlbfs has three main functions: + + * library functions +libhugetlbfs provides functions that allow an application to +explicitly allocate and use hugepages more easily than they could by +directly accessing the hugetlbfs filesystem + + * hugepage malloc() +libhugetlbfs can be used to make an existing application use hugepages +for all its malloc() calls. This works on an existing (dynamically +linked) application binary without modification. + + * hugepage text/data/BSS +libhugetlbfs, in conjunction with included special linker scripts can +be used to make an application which will store its executable text, +its initialized data or BSS, or all of the above in hugepages. This +requires relinking an application, but does not require source-level +modifications. 
+ +This HOWTO explains how to use the libhugetlbfs library. It is for +application developers or system administrators who wish to use any of +the above functions. + +The libhugetlbfs library is a focal point to simplify and standardise +the use of the kernel API. + +Prerequisites +============= + +Hardware prerequisites +---------------------- + +You will need a CPU with some sort of hugepage support, which is +handled by your kernel. The covers recent x86, AMD64 and 64-bit +PowerPC(R) (POWER4, PPC970 and later) CPUs. + +Currently, only x86, AMD64 and PowerPC are fully supported by +libhugetlbfs. IA64 and Sparc64 have a working malloc, and SH64 +should also but it has not been tested. IA64, Sparc64, and SH64 +do not support segment remapping at this time. + +Kernel prerequisites +-------------------- + +To use all the features of libhugetlbfs you will need a 2.6.16 or +later kernel. Many things will work with earlier kernels, but they +have important bugs and missing features. The later sections of the +HOWTO assume a 2.6.16 or later kernel. The kernel must also have +hugepages enabled, that is to say the CONFIG_HUGETLB_PAGE and +CONFIG_HUGETLBFS options must be switched on. + +To check if hugetlbfs is enabled, use one of the following methods: + + * (Preferred) Use "grep hugetlbfs /proc/filesystems" to see if + hugetlbfs is a supported file system. + * On kernels which support /proc/config.gz (for example SLES10 + kernels), you can search for the CONFIG_HUGETLB_PAGE and + CONFIG_HUGETLBFS options in /proc/config.gz + * Finally, attempt to mount hugetlbfs. If it works, the required + hugepage support is enabled. + +Any kernel which meets the above test (even old ones) should support +at least basic libhugetlbfs functions, although old kernels may have +serious bugs. + +The MAP_PRIVATE flag instructs the kernel to return a memory area that +is private to the requesting process. 
To use MAP_PRIVATE mappings, +libhugetlbfs's automatic malloc() (morecore) feature, or the hugepage +text, data, or BSS features, you will need a kernel with hugepage +Copy-on-Write (CoW) support. The 2.6.16 kernel has this. + +PowerPC note: The malloc()/morecore features will generate warnings if +used on PowerPC chips with a kernel where hugepage mappings don't +respect the mmap() hint address (the "hint address" is the first +parameter to mmap(), when MAP_FIXED is not specified; the kernel is +not required to mmap() at this address, but should do so when +possible). 2.6.16 and later kernels do honor the hint address. +Hugepage malloc()/morecore should still work without this patch, but +the size of the hugepage heap will be limited (to around 256M for +32-bit and 1TB for 64-bit). + +The 2.6.27 kernel introduced support for multiple huge page sizes for +systems with the appropriate hardware support. Unless specifically +requested, libhugetlbfs will continue to use the default huge page size. + +Toolchain prerequisites +----------------------- + +The library uses a number of GNU specific features, so you will need to use +both gcc and GNU binutils. For PowerPC and AMD64 systems you will need a +"biarch" compiler, which can build both 32-bit and 64-bit binaries. To use +hugepage text and data segments, GNU binutils version 2.17 (or later) is +recommended. Older versions will work with restricted functionality. + +Configuration prerequisites +--------------------------- + +Direct access to hugepage pool has been deprecated in favor of the +hugeadm utility. This utility can be used for finding the available +hugepage pools and adjusting their minimum and maximum sizes depending +on kernel support. 
+ +To list all available hugepage pools and their current min and max values: + hugeadm --pool-list + +To set the 2MB pool minimum to 10 pages: + hugeadm --pool-pages-min 2MB:10 + +Note that the max pool size will be adjusted to keep the same number of +overcommit pages available if the kernel support is available when min +pages are adjusted + +To add 15 pages to the maximum for 2MB pages: + hugeadm --pool-pages-max 2MB:+15 + +For more information see man 8 hugeadm + +The raw kernel interfaces (as described below) are still available. + +In kernels before 2.6.24, hugepages must be allocated at boot-time via +the hugepages= command-line parameter or at run-time via the +/proc/sys/vm/nr_hugepages sysctl. If memory is restricted on the system, +boot-time allocation is recommended. Hugepages so allocated will be in +the static hugepage pool. + +In kernels starting with 2.6.24, the hugepage pool can grow on-demand. +If this feature should be used, /proc/sys/vm/nr_overcommit_hugepages +should be set to the maximum size of the hugepage pool. No hugepages +need to be allocated via /proc/sys/vm/nr_hugepages or hugepages= in this +case. Hugepages so allocated will be in the dynamic hugepage pool. + +For the running of the libhugetlbfs testsuite (see below), allocating 25 +static hugepages is recommended. Due to memory restrictions, the number +of hugepages requested may not be allocated if the allocation is +attempted at run-time. Users should verify the actual number of +hugepages allocated by: + + hugeadm --pool-list + +or + + grep HugePages_Total /proc/meminfo + +With 25 hugepages allocated, most tests should succeed. However, with +smaller hugepage sizes, many more hugepages may be necessary. + +To use libhugetlbfs features, as well as to run the testsuite, hugetlbfs +must be mounted. Each hugetlbfs mount point is associated with a page +size. To choose the size, use the pagesize mount option. If this option +is omitted, the default huge page size will be used. 
+ +To mount the default huge page size: + + mkdir -p /mnt/hugetlbfs + mount -t hugetlbfs none /mnt/hugetlbfs + +To mount 64KB pages (assuming hardware support): + + mkdir -p /mnt/hugetlbfs-64K + mount -t hugetlbfs none -opagesize=64k /mnt/hugetlbfs-64K + +If hugepages should be available to non-root users, the permissions on +the mountpoint need to be set appropriately. + +Installation +============ + +1. Type "make" to build the library + +This will create "obj32" and/or "obj64" under the top level +libhugetlbfs directory, and build, respectively, 32-bit and 64-bit +shared and static versions (as applicable) of the library into each +directory. This will also build (but not run) the testsuite. + +On i386 systems, only the 32-bit library will be built. On PowerPC +and AMD64 systems, both 32-bit and 64-bit versions will be built (the +32-bit AMD64 version is identical to the i386 version). + +2. Run the testsuite with "make check" + +Running the testsuite is a good idea to ensure that the library is +working properly, and is quite quick (under 3 minutes on a 2GHz Apple +G5). "make func" will run the just the functionality tests, rather +than stress tests (a subset of "make check") which is much quicker. +The testsuite contains tests both for the library's features and for +the underlying kernel hugepage functionality. + +NOTE: The testsuite must be run as the root user. + +WARNING: The testsuite contains testcases explicitly designed to test +for a number of hugepage related kernel bugs uncovered during the +library's development. Some of these testcases WILL CRASH HARD a +kernel without the relevant fixes. 2.6.16 contains all such fixes for +all testcases included as of this writing. + +3. (Optional) Install to system paths with "make install" + +This will install the library images to the system lib/lib32/lib64 as +appropriate, the helper utilities and the manual pages. By default +it will install under /usr/local. 
To put it somewhere else use +PREFIX=/path/to/install on the make command line. For example: + + make install PREFIX=/opt/hugetlbfs +will install under /opt/hugetlbfs. + +"make install" will also install the linker scripts and wrapper for ld +used for hugepage text/data/BSS (see below for details). + +Alternatively, you can use the library from the directory in which it +was built, using the LD_LIBRARY_PATH environment variable. + +To only install the library with linker scripts, the manual pages or the helper +utilities separately, use the install-libs, install-man and install-bin targets +respectively. This can be useful when you wish to install the utilities but +not override the distribution-supported version of libhugetlbfs for example. + +Usage +===== + +Using hugepages for malloc() (morecore) +--------------------------------------- + +This feature allows an existing (dynamically linked) binary executable +to use hugepages for all its malloc() calls. To run a program using +the automatic hugepage malloc() feature, you must set several +environment variables: + +1. Set LD_PRELOAD=libhugetlbfs.so + This tells the dynamic linker to load the libhugetlbfs shared + library, even though the program wasn't originally linked against it. + + Note: If the program is linked against libhugetlbfs, preloading the + library may lead to application crashes. You should skip this + step in that case. + +2. Set LD_LIBRARY_PATH to the directory containing libhugetlbfs.so + This is only necessary if you haven't installed libhugetlbfs.so to a + system default path. If you set LD_LIBRARY_PATH, make sure the + directory referenced contains the right version of the library + (32-bit or 64-bit) as appropriate to the binary you want to run. + +3. Set HUGETLB_MORECORE + This enables the hugepage malloc() feature, instructing libhugetlbfs + to override libc's normal morecore() function with a hugepage + version and use it for malloc(). 
From this point all malloc()s + should come from hugepage memory until it runs out. This option can + be specified in two ways: + + To use the default huge page size: + HUGETLB_MORECORE=yes + + To use a specific huge page size: + HUGETLB_MORECORE=<pagesize> + + To use Transparent Huge Pages (THP): + HUGETLB_MORECORE=thp + +Note: This option requires a kernel that supports Transparent Huge Pages + +Usually it's preferable to set these environment variables on the +command line of the program you wish to run, rather than using +"export", because you'll only want to enable the hugepage malloc() for +particular programs, not everything. + +Examples: + +If you've installed libhugetlbfs in the default place (under +/usr/local) which is in the system library search path use: + $ LD_PRELOAD=libhugetlbfs.so HUGETLB_MORECORE=yes <your app command line> + +If you have built libhugetlbfs in ~/libhugetlbfs and haven't installed +it yet, the following would work for a 64-bit program: + + $ LD_PRELOAD=libhugetlbfs.so LD_LIBRARY_PATH=~/libhugetlbfs/obj64 \ + HUGETLB_MORECORE=yes <your app command line> + +Under some circumstances, you might want to specify the address where +the hugepage heap is located. You can do this by setting the +HUGETLB_MORECORE_HEAPBASE environment variable to the heap address in +hexadecimal. NOTE: this will not work on PowerPC systems with old kernels +which don't respect the hugepage hint address; see Kernel Prerequisites +above. Also note that this option is ignored for THP morecore. + +By default, the hugepage heap begins at roughly the same place a +normal page heap would, rounded up by an amount determined by your +platform. For 32-bit PowerPC binaries the normal page heap address is +rounded-up to a multiple of 256MB (that is, putting it in the next MMU +segment); for 64-bit PowerPC binaries the address is rounded-up to a +multiple of 1TB. On all other platforms the address is rounded-up to +the size of a hugepage. 
+ +By default, the hugepage heap will be prefaulted by libhugetlbfs to +guarantee enough hugepages exist and are reserved for the application +(if this was not done, applications could receive a SIGKILL signal if +hugepages needed for the heap are used by another application before +they are faulted in). This leads to local-node allocations when no +memory policy is in place for hugepages. Therefore, it is recommended to +use + + $ numactl --interleave=all <your app command line> + +to regain some of the performance impact of local-node allocations on +large NUMA systems. This can still result in poor performance for those +applications which carefully place their threads on particular nodes +(such as by using OpenMP). In that case, thread-local allocation is +preferred so users should select a memory policy that corresponds to +the run-time behavior of the process' CPU usage. Users can specify +HUGETLB_NO_PREFAULT to prevent the prefaulting of hugepages and instead +rely on run-time faulting of hugepages. NOTE: specifying +HUGETLB_NO_PREFAULT on a system where hugepages are available to and +used by many processes can result in some applications receiving SIGKILL, +so its use is not recommended in high-availability or production +environments. + +By default, the hugepage heap does not shrink. To enable hugepage heap +shrinking, set HUGETLB_MORECORE_SHRINK=yes. NB: We have been seeing some +unexpected behavior from glibc's malloc when this is enabled. + +Using hugepage shared memory +---------------------------- + +Hugepages are used for shared memory segments if the SHM_HUGETLB flag is +set when calling shmget() and the pool is large enough. For hugepage-unaware +applications, libhugetlbfs overrides shmget and adds the SHM_HUGETLB if the +environment variable HUGETLB_SHM is set to "yes". The steps to use hugepages +with applications not linked to libhugetlbfs are similar to morecore except +for step 3. + +1. 
Set LD_PRELOAD=libhugetlbfs.so + This tells the dynamic linker to load the libhugetlbfs shared + library, even though the program wasn't originally linked against it. + + Note: If the program is linked against libhugetlbfs, preloading the + library may lead to application crashes. You should skip this + step in that case. + +2. Set LD_LIBRARY_PATH to the directory containing libhugetlbfs.so + This is only necessary if you haven't installed libhugetlbfs.so to a + system default path. If you set LD_LIBRARY_PATH, make sure the + directory referenced contains the right version of the library + (32-bit or 64-bit) as appropriate to the binary you want to run. + +3. Set HUGETLB_SHM=yes + The shmget() call is overridden whether the application is linked or the + libhugetlbfs library is preloaded. When this environment variable is set, + the SHM_HUGETLB flag is added to the call and the size parameter is aligned + to back the shared memory segment with huge pages. In the event hugepages + cannot be used, small pages will be used instead and a warning will be + printed to explain the failure. + + Note: It is not possible to select any huge page size other than the + system default for this option. If the kernel supports multiple + huge page sizes, the size used for shared memory can be changed by + altering the default huge page size via the default_hugepagesz + kernel boot parameter. + +Using hugepage text, data, or BSS +--------------------------------- + +To use the hugepage text, data, or BSS segments feature, you need to specially +link your application. How this is done depends on the version of GNU ld. To +support ld versions older than 2.17, libhugetlbfs provides custom linker +scripts that must be used to achieve the required binary layout. With version +2.17 or later, the system default linker scripts should be used. + +To link an application for hugepages, you should use the the ld.hugetlbfs +script included with libhugetlbfs in place of your normal linker. 
Without any +special options this will simply invoke GNU ld with the same parameters. When +it is invoked with options detailed in the following sections, ld.hugetlbfs +will call the system linker with all of the options necessary to link for +hugepages. If a custom linker script is required, it will also be selected. + +If you installed ld.hugetlbfs using "make install", or if you run it +from the place where you built libhugetlbfs, it should automatically +be able to find the libhugetlbfs linker scripts. Otherwise you may +need to explicitly instruct it where to find the scripts with the +option: + --hugetlbfs-script-path=/path/to/scripts +(The linker scripts are in the ldscripts/ subdirectory of the +libhugetlbfs source tree). + + Linking the application with binutils-2.17 or later: + ---------------------------------------------------- + +This method will use the system default linker scripts. Only one linker option +is required to prepare the application for hugepages: + + --hugetlbfs-align + +will instruct ld.hugetlbfs to call GNU ld with two options that increase the +alignment of the resulting binary. For reference, the options passed to ld are: + + -z common-page-size=<value> and + -z max-page-size=<value> + + Linking the application with binutils-2.16 or older: + ---------------------------------------------------- + +To link a program with a custom linker script, one of the following linker +options should be specified: + + --hugetlbfs-link=B + +will link the application to store BSS data (only) into hugepages + + --hugetlbfs-link=BDT + +will link the application to store text, initialized data and BSS data +into hugepages. + +These are the only two available options when using custom linker scripts. + + A note about the custom libhugetlbfs linker scripts: + ---------------------------------------------------- + +Linker scripts are usually distributed with GNU binutils and they may contain a +partial implementation of new linker features. 
As binutils evolves, the linker +scripts supplied with previous versions become obsolete and are upgraded. + +Libhugetlbfs distributes one set of linker scripts that must work across +several Linux distributions and binutils versions. This has worked well for +some time but binutils-2.17 (including some late 2.16 builds) have made changes +that are impossible to accomodate without breaking the libhugetlbfs linker +scripts for older versions of binutils. This is why the linker scripts (and +the --hugetlbfs-link ld.hugetlbfs option) have been deprecated for binutils >= +2.17 configurations. + +If you are using a late 2.16 binutils version (such as 2.16.91) and are +experiencing problems with huge page text, data, and bss, you can check +binutils for the incompatibility with the following command: + + ld --verbose | grep SPECIAL + +If any matches are returned, then the libhugetlbfs linker scripts may not work +correctly. In this case you should upgrade to binutils >= 2.17 and use the +--hugetlbfs-align linking method. + + Linking via gcc: + ---------------- + +In many cases it's normal to link an application by invoking gcc, +which will then invoke the linker with appropriate options, rather +than invoking ld directly. In such cases it's usually best to +convince gcc to invoke the ld.hugetlbfs script instead of the system +linker, rather than modifying your build procedure to invoke the +ld.hugetlbfs directly; the compilers may often add special libraries +or other linker options which can be fiddly to reproduce by hand. +To make this easier, 'make install' will install ld.hugetlbfs into +$PREFIX/share/libhugetlbfs and create an 'ld' symlink to it. + +Then with gcc, you invoke it as a linker with two options: + + -B $PREFIX/share/libhugetlbfs + +This option tells gcc to look in a non-standard location for the +linker, thus finding our script rather than the normal linker. This +can optionally be set in the CFLAGS environment variable. 
+ + -Wl,--hugetlbfs-align +OR -Wl,--hugetlbfs-link=B +OR -Wl,--hugetlbfs-link=BDT + +This option instructs gcc to pass the option after the comma down to the +linker, thus invoking the special behaviour of the ld.hugetblfs script. This +can optionally be set in the LDFLAGS environment variable. + +If you use a compiler other than gcc, you will need to consult its +documentation to see how to convince it to invoke ld.hugetlbfs in +place of the system linker. + + Running the application: + ------------------------ + +The specially-linked application needs the libhugetlbfs library, so +you might need to set the LD_LIBRARY_PATH environment variable so the +application can locate libhugetlbfs.so. Depending on the method used to link +the application, the HUGETLB_ELFMAP environment variable can be used to control +how hugepages will be used. + + When using --hugetlbfs-link: + ---------------------------- + +The custom linker script determines which segments may be remapped into +hugepages and this remapping will occur by default. The following setting will +disable remapping entirely: + + HUGETLB_ELFMAP=no + + When using --hugetlbfs-align: + ----------------------------- + +This method of linking an application permits greater flexibility at runtime. +Using HUGETLB_ELFMAP, it is possible to control which program segments are +placed in hugepages. 
The following four settings will cause the indicated +segments to be placed in hugepages: + + HUGETLB_ELFMAP=R Read-only segments (text) + HUGETLB_ELFMAP=W Writable segments (data/BSS) + HUGETLB_ELFMAP=RW All segments (text/data/BSS) + HUGETLB_ELFMAP=no No segments + +It is possible to select specific huge page sizes for read-only and writable +segments by using the following advanced syntax: + + HUGETLB_ELFMAP=[R[=<pagesize>]][:W[=<pagesize>]] + +For example: + + Place read-only segments into 64k pages and writable into 16M pages + HUGETLB_ELFMAP=R=64k:W=16M + + Use the default for read-only segments, 1G pages for writable segments + HUGETLB_ELFMAP=R:W=1G + + Use 16M pages for writable segments only + HUGETLB_ELFMAP=W=16M + + Default remapping behavior: + --------------------------- + +If --hugetlbfs-link was used to link an application, the chosen remapping mode +is saved in the binary and becomes the default behavior. Setting +HUGETLB_ELFMAP=no will disable all remapping and is the only way to modify the +default behavior. + +For applications linked with --hugetlbfs-align, the default behavior is to not +remap any segments into huge pages. To set or display the default remapping +mode for a binary, the included hugeedit command can be used: + +hugeedit [options] target-executable + options: + --text,--data Remap the specified segment into huge pages by default + --disable Do not remap any segments by default + +When target-executable is the only argument, hugeedit will display the default +remapping mode without making any modifications. + +When a binary is remapped according to its default remapping policy, the +system default huge page size will be used. 
+ + Environment variables: + ---------------------- + +There are a number of private environment variables which can affect +libhugetlbfs: + HUGETLB_DEFAULT_PAGE_SIZE + Override the system default huge page size for all uses + except hugetlb-backed shared memory + + HUGETLB_RESTRICT_EXE + By default, libhugetlbfs will act on any program that it + is loaded with, either via LD_PRELOAD or by explicitly + linking with -lhugetlbfs. + + There are situations in which it is desirable to restrict + libhugetlbfs' actions to specific programs. For example, + some ISV applications are wrapped in a series of scripts + that invoke bash, python, and/or perl. It is more + convenient to set the environment variables related + to libhugetlbfs before invoking the wrapper scripts, + yet this has the unintended and undesirable consequence + of causing the script interpreters to use and consume + hugepages. There is no obvious benefit to causing the + script interpreters to use hugepages, and there is a + clear disadvantage: fewer hugepages are available to + the actual application. + + To address this scenario, set HUGETLB_RESTRICT_EXE to a + colon-separated list of programs to which the other + libhugetlbfs environment variables should apply. (If + not set, libhugetlbfs will attempt to apply the requested + actions to all programs.) For example, + + HUGETLB_RESTRICT_EXE="hpcc:long_hpcc" + + will restrict libhugetlbfs' actions to programs named + /home/fred/hpcc and /bench/long_hpcc but not /usr/hpcc_no. 
+ + HUGETLB_ELFMAP + Control or disable segment remapping (see above) + + HUGETLB_MINIMAL_COPY + If equal to "no", the entire segment will be copied; + otherwise, only the necessary parts will be, which can + be much more efficient (default) + + HUGETLB_FORCE_ELFMAP + Explained in "Partial segment remapping" + + HUGETLB_MORECORE + HUGETLB_MORECORE_HEAPBASE + HUGETLB_NO_PREFAULT + Explained in "Using hugepages for malloc() + (morecore)" + + HUGETLB_VERBOSE + Specify the verbosity level of debugging output from 1 + to 99 (default is 1) + HUGETLB_PATH + Specify the path to the hugetlbfs mount point + HUGETLB_SHARE + Explained in "Sharing remapped segments" + HUGETLB_DEBUG + Set to 1 if an application segfaults. Gives very detailed output + and runs extra diagnostics. + + Sharing remapped segments: + -------------------------- + +By default, libhugetlbfs uses anonymous, unlinked hugetlbfs files +to store remapped program segment data. This means that if the same +program is started multiple times using hugepage segments, multiple +huge pages will be used to store the same program data. + +To reduce this wastage, libhugetlbfs can be instructed to allow +sharing segments between multiple invocations of a program. To do +this, the HUGETLB_SHARE variable must be set for all the +processes in question. This variable has two possible values: + anything but 1: the default, indicates no segments should be shared + 1: indicates that read-only segments (i.e. the program text, +in most cases) should be shared, read-write segments (data and bss) +will not be shared. + +If the HUGETLB_MINIMAL_COPY variable is set for any program using +shared segments, it must be set to the same value for all invocations +of that program. + +Segment sharing is implemented by creating persistent files in a +hugetlbfs containing the necessary segment data. 
By default, these +files are stored in a subdirectory of the first located hugetlbfs +filesystem, named 'elflink-uid-XXX' where XXX is the uid of the +process using sharing. This directory must be owned by the uid in +question, and have mode 0700. If it doesn't exist, libhugetlbfs will +create it automatically. This means that (by default) separate +invocations of the same program by different users will not share huge +pages. + +The location for storing the hugetlbfs page files can be changed by +setting the HUGETLB_SHARE_PATH environment variable. If set, this +variable must contain the path of an accessible, already created +directory located in a hugetlbfs filesystem. The owner and mode of +this directory are not checked, so this method can be used to allow +processes of multiple uids to share huge pages. IMPORTANT SECURITY +NOTE: any process sharing hugepages can insert arbitrary executable +code into any other process sharing hugepages in the same directory. +Therefore, when using HUGETLB_SHARE_PATH, the directory created *must* +allow access only to a set of uids who are mutually trusted. + +The files created in hugetlbfs for sharing are persistent, and must be +manually deleted to free the hugepages in question. Future versions +of libhugetlbfs should include tools and scripts to automate this +cleanup. + + Partial segment remapping + ------------------------- + +libhugetlbfs has limited support for remapping a normal, non-relinked +binary's data, text and BSS into hugepages. To enable this feature, +HUGETLB_FORCE_ELFMAP must be set to "yes". + +Partial segment remapping is not guaranteed to work. 
Most importantly, a +binary's segments must be large enough even when not relinked by +libhugetlbfs: + + architecture address minimum segment size + ------------ ------- -------------------- + i386, x86_64 all hugepage size + ppc32 all 256M + ppc64 0-4G 256M + ppc64 4G-1T 1020G + ppc64 1T+ 1T + +The raw size, though, is not sufficient to indicate if the code will +succeed, due to alignment. Since the binary is not relinked, however, +this is relatively straightforward to 'test and see'. + +NOTE: You must use LD_PRELOAD to load libhugetlbfs.so when using +partial remapping. + + +Examples +======== + +Example 1: Application Developer +--------------------------------- + +To have a program use hugepages, complete the following steps: + +1. Make sure you are working with kernel 2.6.16 or greater. + +2. Modify the build procedure so your application is linked against +libhugetlbfs. + +For the remapping, you link against the library with the appropriate +linker script (if necessary or desired). Linking against the library +should result in transparent usage of hugepages. + +Example 2: End Users and System Administrators +----------------------------------------------- + +To have an application use libhugetlbfs, complete the following steps: + +1. Make sure you are using kernel 2.6.16. + +2. Make sure the library is in the path, which you can set with the +LD_LIBRARY_PATH environment variable. You might need to set other +environment variables, including LD_PRELOAD as described above. + + +Troubleshooting +=============== + +The library has a certain amount of debugging code built in, which can +be controlled with the environment variable HUGETLB_VERBOSE. By +default the debug level is "1" which means the library will only print +relatively serious error messages. Setting HUGETLB_VERBOSE=2 or +higher will enable more debug messages (at present 2 is the highest +debug level, but that may change). 
Setting HUGETLB_VERBOSE=0 will +silence the library completely, even in the case of errors - the only +exception is in cases where the library has to abort(), which can +happen if something goes wrong in the middle of unmapping and +remapping segments for the text/data/bss feature. + +If an application fails to run, set the environment variable HUGETLB_DEBUG +to 1. This causes additional diagnostics to be run. This information should +be included when sending bug reports to the libhugetlbfs team. + +Specific Scenarios: +------------------- + +ISSUE: When using the --hugetlbfs-align or -zmax-page-size link options, the + linker complains about truncated relocations and the build fails. + +TRY: Compile the program with the --relax linker option. Either add + -Wl,--relax to CFLAGS or --relax to LDFLAGS. + +ISSUE: When using the xB linker script with a 32 bit binary on an x86 host with + NX support enabled, the binary segfaults. + +TRY: Recompiling with the --hugetlbfs-align options and use the new relinking + method or booting your kernel with noexec32=off. + + +Trademarks +========== + +This work represents the view of the author and does not necessarily +represent the view of IBM. + +PowerPC is a registered trademark of International Business Machines +Corporation in the United States, other countries, or both. Linux is +a trademark of Linus Torvalds in the United States, other countries, +or both. diff --git a/default/libhugetlbfs/libhugetlbfs/LGPL-2.1 b/default/libhugetlbfs/libhugetlbfs/LGPL-2.1 new file mode 100644 index 0000000..2d2d780 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/LGPL-2.1 @@ -0,0 +1,510 @@ + + GNU LESSER GENERAL PUBLIC LICENSE + Version 2.1, February 1999 + + Copyright (C) 1991, 1999 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ +[This is the first released version of the Lesser GPL. It also counts + as the successor of the GNU Library Public License, version 2, hence + the version number 2.1.] + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +Licenses are intended to guarantee your freedom to share and change +free software--to make sure the software is free for all its users. + + This license, the Lesser General Public License, applies to some +specially designated software packages--typically libraries--of the +Free Software Foundation and other authors who decide to use it. You +can use it too, but we suggest you first think carefully about whether +this license or the ordinary General Public License is the better +strategy to use in any particular case, based on the explanations +below. + + When we speak of free software, we are referring to freedom of use, +not price. Our General Public Licenses are designed to make sure that +you have the freedom to distribute copies of free software (and charge +for this service if you wish); that you receive source code or can get +it if you want it; that you can change the software and use pieces of +it in new free programs; and that you are informed that you can do +these things. + + To protect your rights, we need to make restrictions that forbid +distributors to deny you these rights or to ask you to surrender these +rights. These restrictions translate to certain responsibilities for +you if you distribute copies of the library or if you modify it. + + For example, if you distribute copies of the library, whether gratis +or for a fee, you must give the recipients all the rights that we gave +you. You must make sure that they, too, receive or can get the source +code. 
If you link other code with the library, you must provide +complete object files to the recipients, so that they can relink them +with the library after making changes to the library and recompiling +it. And you must show them these terms so they know their rights. + + We protect your rights with a two-step method: (1) we copyright the +library, and (2) we offer you this license, which gives you legal +permission to copy, distribute and/or modify the library. + + To protect each distributor, we want to make it very clear that +there is no warranty for the free library. Also, if the library is +modified by someone else and passed on, the recipients should know +that what they have is not the original version, so that the original +author's reputation will not be affected by problems that might be +introduced by others. + + Finally, software patents pose a constant threat to the existence of +any free program. We wish to make sure that a company cannot +effectively restrict the users of a free program by obtaining a +restrictive license from a patent holder. Therefore, we insist that +any patent license obtained for a version of the library must be +consistent with the full freedom of use specified in this license. + + Most GNU software, including some libraries, is covered by the +ordinary GNU General Public License. This license, the GNU Lesser +General Public License, applies to certain designated libraries, and +is quite different from the ordinary General Public License. We use +this license for certain libraries in order to permit linking those +libraries into non-free programs. + + When a program is linked with a library, whether statically or using +a shared library, the combination of the two is legally speaking a +combined work, a derivative of the original library. The ordinary +General Public License therefore permits such linking only if the +entire combination fits its criteria of freedom. 
The Lesser General +Public License permits more lax criteria for linking other code with +the library. + + We call this license the "Lesser" General Public License because it +does Less to protect the user's freedom than the ordinary General +Public License. It also provides other free software developers Less +of an advantage over competing non-free programs. These disadvantages +are the reason we use the ordinary General Public License for many +libraries. However, the Lesser license provides advantages in certain +special circumstances. + + For example, on rare occasions, there may be a special need to +encourage the widest possible use of a certain library, so that it +becomes a de-facto standard. To achieve this, non-free programs must +be allowed to use the library. A more frequent case is that a free +library does the same job as widely used non-free libraries. In this +case, there is little to gain by limiting the free library to free +software only, so we use the Lesser General Public License. + + In other cases, permission to use a particular library in non-free +programs enables a greater number of people to use a large body of +free software. For example, permission to use the GNU C Library in +non-free programs enables many more people to use the whole GNU +operating system, as well as its variant, the GNU/Linux operating +system. + + Although the Lesser General Public License is Less protective of the +users' freedom, it does ensure that the user of a program that is +linked with the Library has the freedom and the wherewithal to run +that program using a modified version of the Library. + + The precise terms and conditions for copying, distribution and +modification follow. Pay close attention to the difference between a +"work based on the library" and a "work that uses the library". The +former contains code derived from the library, whereas the latter must +be combined with the library in order to run. 
+ + GNU LESSER GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License Agreement applies to any software library or other +program which contains a notice placed by the copyright holder or +other authorized party saying it may be distributed under the terms of +this Lesser General Public License (also called "this License"). +Each licensee is addressed as "you". + + A "library" means a collection of software functions and/or data +prepared so as to be conveniently linked with application programs +(which use some of those functions and data) to form executables. + + The "Library", below, refers to any such software library or work +which has been distributed under these terms. A "work based on the +Library" means either the Library or any derivative work under +copyright law: that is to say, a work containing the Library or a +portion of it, either verbatim or with modifications and/or translated +straightforwardly into another language. (Hereinafter, translation is +included without limitation in the term "modification".) + + "Source code" for a work means the preferred form of the work for +making modifications to it. For a library, complete source code means +all the source code for all modules it contains, plus any associated +interface definition files, plus the scripts used to control +compilation and installation of the library. + + Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running a program using the Library is not restricted, and output from +such a program is covered only if its contents constitute a work based +on the Library (independent of the use of the Library in a tool for +writing it). Whether that is true depends on what the Library does +and what the program that uses the Library does. + + 1. 
You may copy and distribute verbatim copies of the Library's +complete source code as you receive it, in any medium, provided that +you conspicuously and appropriately publish on each copy an +appropriate copyright notice and disclaimer of warranty; keep intact +all the notices that refer to this License and to the absence of any +warranty; and distribute a copy of this License along with the +Library. + + You may charge a fee for the physical act of transferring a copy, +and you may at your option offer warranty protection in exchange for a +fee. + + 2. You may modify your copy or copies of the Library or any portion +of it, thus forming a work based on the Library, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) The modified work must itself be a software library. + + b) You must cause the files modified to carry prominent notices + stating that you changed the files and the date of any change. + + c) You must cause the whole of the work to be licensed at no + charge to all third parties under the terms of this License. + + d) If a facility in the modified Library refers to a function or a + table of data to be supplied by an application program that uses + the facility, other than as an argument passed when the facility + is invoked, then you must make a good faith effort to ensure that, + in the event an application does not supply such function or + table, the facility still operates, and performs whatever part of + its purpose remains meaningful. + + (For example, a function in a library to compute square roots has + a purpose that is entirely well-defined independent of the + application. Therefore, Subsection 2d requires that any + application-supplied function or table used by this function must + be optional: if the application does not supply it, the square + root function must still compute square roots.) 
+ +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Library, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Library, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote +it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Library. + +In addition, mere aggregation of another work not based on the Library +with the Library (or with a work based on the Library) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may opt to apply the terms of the ordinary GNU General Public +License instead of this License to a given copy of the Library. To do +this, you must alter all the notices that refer to this License, so +that they refer to the ordinary GNU General Public License, version 2, +instead of to this License. (If a newer version than version 2 of the +ordinary GNU General Public License has appeared, then you can specify +that version instead if you wish.) Do not make any other change in +these notices. + + Once this change is made in a given copy, it is irreversible for +that copy, so the ordinary GNU General Public License applies to all +subsequent copies and derivative works made from that copy. + + This option is useful when you wish to copy part of the code of +the Library into a program that is not a library. + + 4. 
You may copy and distribute the Library (or a portion or +derivative of it, under Section 2) in object code or executable form +under the terms of Sections 1 and 2 above provided that you accompany +it with the complete corresponding machine-readable source code, which +must be distributed under the terms of Sections 1 and 2 above on a +medium customarily used for software interchange. + + If distribution of object code is made by offering access to copy +from a designated place, then offering equivalent access to copy the +source code from the same place satisfies the requirement to +distribute the source code, even though third parties are not +compelled to copy the source along with the object code. + + 5. A program that contains no derivative of any portion of the +Library, but is designed to work with the Library by being compiled or +linked with it, is called a "work that uses the Library". Such a +work, in isolation, is not a derivative work of the Library, and +therefore falls outside the scope of this License. + + However, linking a "work that uses the Library" with the Library +creates an executable that is a derivative of the Library (because it +contains portions of the Library), rather than a "work that uses the +library". The executable is therefore covered by this License. +Section 6 states terms for distribution of such executables. + + When a "work that uses the Library" uses material from a header file +that is part of the Library, the object code for the work may be a +derivative work of the Library even though the source code is not. +Whether this is true is especially significant if the work can be +linked without the Library, or if the work is itself a library. The +threshold for this to be true is not precisely defined by law. 
+ + If such an object file uses only numerical parameters, data +structure layouts and accessors, and small macros and small inline +functions (ten lines or less in length), then the use of the object +file is unrestricted, regardless of whether it is legally a derivative +work. (Executables containing this object code plus portions of the +Library will still fall under Section 6.) + + Otherwise, if the work is a derivative of the Library, you may +distribute the object code for the work under the terms of Section 6. +Any executables containing that work also fall under Section 6, +whether or not they are linked directly with the Library itself. + + 6. As an exception to the Sections above, you may also combine or +link a "work that uses the Library" with the Library to produce a +work containing portions of the Library, and distribute that work +under terms of your choice, provided that the terms permit +modification of the work for the customer's own use and reverse +engineering for debugging such modifications. + + You must give prominent notice with each copy of the work that the +Library is used in it and that the Library and its use are covered by +this License. You must supply a copy of this License. If the work +during execution displays copyright notices, you must include the +copyright notice for the Library among them, as well as a reference +directing the user to the copy of this License. Also, you must do one +of these things: + + a) Accompany the work with the complete corresponding + machine-readable source code for the Library including whatever + changes were used in the work (which must be distributed under + Sections 1 and 2 above); and, if the work is an executable linked + with the Library, with the complete machine-readable "work that + uses the Library", as object code and/or source code, so that the + user can modify the Library and then relink to produce a modified + executable containing the modified Library. 
(It is understood + that the user who changes the contents of definitions files in the + Library will not necessarily be able to recompile the application + to use the modified definitions.) + + b) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (1) uses at run time a + copy of the library already present on the user's computer system, + rather than copying library functions into the executable, and (2) + will operate properly with a modified version of the library, if + the user installs one, as long as the modified version is + interface-compatible with the version that the work was made with. + + c) Accompany the work with a written offer, valid for at least + three years, to give the same user the materials specified in + Subsection 6a, above, for a charge no more than the cost of + performing this distribution. + + d) If distribution of the work is made by offering access to copy + from a designated place, offer equivalent access to copy the above + specified materials from the same place. + + e) Verify that the user has already received a copy of these + materials or that you have already sent this user a copy. + + For an executable, the required form of the "work that uses the +Library" must include any data and utility programs needed for +reproducing the executable from it. However, as a special exception, +the materials to be distributed need not include anything that is +normally distributed (in either source or binary form) with the major +components (compiler, kernel, and so on) of the operating system on +which the executable runs, unless that component itself accompanies +the executable. + + It may happen that this requirement contradicts the license +restrictions of other proprietary libraries that do not normally +accompany the operating system. Such a contradiction means you cannot +use both them and the Library together in an executable that you +distribute. + + 7. 
You may place library facilities that are a work based on the +Library side-by-side in a single library together with other library +facilities not covered by this License, and distribute such a combined +library, provided that the separate distribution of the work based on +the Library and of the other library facilities is otherwise +permitted, and provided that you do these two things: + + a) Accompany the combined library with a copy of the same work + based on the Library, uncombined with any other library + facilities. This must be distributed under the terms of the + Sections above. + + b) Give prominent notice with the combined library of the fact + that part of it is a work based on the Library, and explaining + where to find the accompanying uncombined form of the same work. + + 8. You may not copy, modify, sublicense, link with, or distribute +the Library except as expressly provided under this License. Any +attempt otherwise to copy, modify, sublicense, link with, or +distribute the Library is void, and will automatically terminate your +rights under this License. However, parties who have received copies, +or rights, from you under this License will not have their licenses +terminated so long as such parties remain in full compliance. + + 9. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Library or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Library (or any work based on the +Library), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Library or works based on it. + + 10. 
Each time you redistribute the Library (or any work based on the +Library), the recipient automatically receives a license from the +original licensor to copy, distribute, link with or modify the Library +subject to these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties with +this License. + + 11. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Library at all. For example, if a patent +license would not permit royalty-free redistribution of the Library by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Library. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply, and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 12. If the distribution and/or use of the Library is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Library under this License +may add an explicit geographical distribution limitation excluding those +countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 13. The Free Software Foundation may publish revised and/or new +versions of the Lesser General Public License from time to time. +Such new versions will be similar in spirit to the present version, +but may differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Library +specifies a version number of this License which applies to it and +"any later version", you have the option of following the terms and +conditions either of that version or of any later version published by +the Free Software Foundation. If the Library does not specify a +license version number, you may choose any version ever published by +the Free Software Foundation. + + 14. If you wish to incorporate parts of the Library into other free +programs whose distribution conditions are incompatible with these, +write to the author to ask for permission. For software which is +copyrighted by the Free Software Foundation, write to the Free +Software Foundation; we sometimes make exceptions for this. 
Our +decision will be guided by the two goals of preserving the free status +of all derivatives of our free software and of promoting the sharing +and reuse of software generally. + + NO WARRANTY + + 15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO +WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW. +EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR +OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY +KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE +LIBRARY IS WITH YOU. SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME +THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN +WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY +AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU +FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR +CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE +LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING +RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A +FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF +SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Libraries + + If you develop a new library, and you want it to be of the greatest +possible use to the public, we recommend making it free software that +everyone can redistribute and change. You can do so by permitting +redistribution under these terms (or, alternatively, under the terms +of the ordinary General Public License). + + To apply these terms, attach the following notices to the library. 
+It is safest to attach them to the start of each source file to most +effectively convey the exclusion of warranty; and each file should +have at least the "copyright" line and a pointer to where the full +notice is found. + + + <one line to give the library's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +Also add information on how to contact you by electronic and paper mail. + +You should also get your employer (if you work as a programmer) or +your school, if any, to sign a "copyright disclaimer" for the library, +if necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the + library `Frob' (a library for tweaking knobs) written by James + Random Hacker. + + <signature of Ty Coon>, 1 April 1990 + Ty Coon, President of Vice + +That's all there is to it! 
+ + diff --git a/default/libhugetlbfs/libhugetlbfs/Makefile b/default/libhugetlbfs/libhugetlbfs/Makefile new file mode 100644 index 0000000..d33e972 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/Makefile @@ -0,0 +1,437 @@ +PREFIX ?= /usr/local +EXEDIR ?= /bin + +LIBOBJS = hugeutils.o version.o init.o morecore.o debug.o alloc.o shm.o kernel-features.o +LIBPUOBJS = init_privutils.o debug.o hugeutils.o kernel-features.o +INSTALL_OBJ_LIBS = libhugetlbfs.so libhugetlbfs_privutils.so +BIN_OBJ_DIR=obj +PM_OBJ_DIR=TLBC +INSTALL_BIN = hugectl hugeedit hugeadm pagesize +INSTALL_SCRIPT = cpupcstat oprofile_map_events.pl oprofile_start.sh +INSTALL_HELPER = huge_page_setup_helper.py +INSTALL_PERLMOD = DataCollect.pm OpCollect.pm PerfCollect.pm Report.pm +INSTALL_HEADERS = hugetlbfs.h +INSTALL_MAN1 = pagesize.1 +INSTALL_MAN3 = get_huge_pages.3 get_hugepage_region.3 \ + gethugepagesizes.3 getpagesizes.3 +INSTALL_MAN7 = libhugetlbfs.7 +INSTALL_MAN8 = hugectl.8 hugeedit.8 hugeadm.8 cpupcstat.8 +LDSCRIPT_TYPES = B BDT +LDSCRIPT_DIST_ELF = elf32ppclinux elf64ppc elf_i386 elf_x86_64 +INSTALL_OBJSCRIPT = ld.hugetlbfs +VERSION=version.h +SOURCE = $(shell find . -maxdepth 1 ! 
-name version.h -a -name '*.[h]') +SOURCE += *.c *.lds Makefile +NODEPTARGETS=<version.h> <clean> + +INSTALL = install + +LDFLAGS += -Wl,-z,noexecstack -ldl +CFLAGS ?= -O2 -g +CFLAGS += -Wall -fPIC +CPPFLAGS += -D__LIBHUGETLBFS__ + +ARCH = $(shell uname -m | sed -e s/i.86/i386/) + +ifeq ($(ARCH),ppc64) +CC64 = gcc -m64 +ELF64 = elf64ppc +TMPLIB64 = lib64 +TMPLIB32 = lib +ifneq ($(BUILDTYPE),NATIVEONLY) +CC32 = gcc -m32 +ELF32 = elf32ppclinux +endif +else +ifeq ($(ARCH),ppc) +CC32 = gcc -m32 +ELF32 = elf32ppclinux +TMPLIB32 = lib +else +ifeq ($(ARCH),i386) +CC32 = gcc +ELF32 = elf_i386 +TMPLIB32 = lib +else +ifeq ($(ARCH),x86_64) +CC64 = gcc -m64 +ELF64 = elf_x86_64 +TMPLIB64 = lib64 +TMPLIB32 = lib +ifneq ($(BUILDTYPE),NATIVEONLY) +CC32 = gcc -m32 +ELF32 = elf_i386 +endif +else +ifeq ($(ARCH),ia64) +CC64 = gcc +TMPLIB64 = lib64 +CFLAGS += -DNO_ELFLINK +else +ifeq ($(ARCH),sparc64) +CC64 = gcc -m64 +TMPLIB64 = lib64 +CFLAGS += -DNO_ELFLINK +else +ifeq ($(ARCH),s390x) +CC64 = gcc -m64 +CC32 = gcc -m31 +TMPLIB64 = lib64 +TMPLIB32 = lib +CFLAGS += -DNO_ELFLINK +else +$(error "Unrecognized architecture ($(ARCH))") +endif +endif +endif +endif +endif +endif +endif + +ifdef CC32 +OBJDIRS += obj32 +endif +ifdef CC64 +OBJDIRS += obj64 +endif + +ifdef CC64 +CCBIN = $(CC64) +else +CCBIN = $(CC32) +endif + +ifdef ELF32 +LIBOBJS32 = obj32/elflink.o obj32/sys-$(ELF32).o +endif +ifdef ELF64 +LIBOBJS64 = obj64/elflink.o obj64/sys-$(ELF64).o +endif +ifeq ($(ELF32),elf32ppclinux) +LIBOBJS32 += obj32/$(ELF32).o +endif +ifeq ($(ELF64),elf64ppc) +LIBOBJS64 += obj64/$(ELF64).o +endif +LIBOBJS32 += $(LIBOBJS:%=obj32/%) +LIBOBJS64 += $(LIBOBJS:%=obj64/%) + +ifeq ($(LIB32),) +LIB32 = $(TMPLIB32) +endif + +ifdef TMPLIB64 +ifeq ($(LIB64),) +LIB64 = $(TMPLIB64) +endif +endif + +# If TMPLIB64 is set, then sure we are not resolving LIB32 and LIB64 to the +# same place +ifdef TMPLIB64 + +REALLIB32 = $(realpath $(PREFIX)/$(LIB32)) +REALLIB64 = $(realpath $(PREFIX)/$(LIB64)) +ifneq ($(realpath 
$(PREFIX)),) +ifeq ($(REALLIB32),$(REALLIB64)) +$(error LIB32 ($(PREFIX)/$(LIB32) to $(REALLIB32)) and LIB64 ($(PREFIX)/$(LIB64) to $(REALLIB64)) are resolving to the same place. Manually specify LIB32 and LIB64. e.g. make PREFIX=$(PREFIX) LIB32=lib32 LIB64=lib64) +endif +endif + +endif + +HEADERDIR = $(PREFIX)/include +LIBDIR32 = $(PREFIX)/$(LIB32) +LIBDIR64 = $(PREFIX)/$(LIB64) +LDSCRIPTDIR = $(PREFIX)/share/libhugetlbfs/ldscripts +BINDIR = $(PREFIX)/share/libhugetlbfs +EXEDIR = $(PREFIX)/bin +DOCDIR = $(PREFIX)/share/doc/libhugetlbfs +PMDIR = $(PREFIX)/lib/perl5/TLBC +MANDIR1 = $(PREFIX)/share/man/man1 +MANDIR3 = $(PREFIX)/share/man/man3 +MANDIR7 = $(PREFIX)/share/man/man7 +MANDIR8 = $(PREFIX)/share/man/man8 + +ifdef LIB32 +LIBPATHS += -DLIB32='"$(LIB32)"' -DLIBDIR32='"$(LIBDIR32)"' +endif +ifdef LIB64 +LIBPATHS += -DLIB64='"$(LIB64)"' -DLIBDIR64='"$(LIBDIR64)"' +endif + +EXTRA_DIST = \ + README \ + HOWTO \ + LGPL-2.1 + +INSTALL_LDSCRIPTS = $(foreach type,$(LDSCRIPT_TYPES),$(LDSCRIPT_DIST_ELF:%=%.x$(type))) + +ifdef V +VECHO = : +else +VECHO = echo " " +ARFLAGS = rc +.SILENT: +endif + +DEPFILES = $(LIBOBJS:%.o=%.d) + +export ARCH +export OBJDIRS +export CC32 +export CC64 +export ELF32 +export ELF64 +export LIBDIR32 +export LIBDIR64 + +all: libs tests tools + +.PHONY: tests libs + +libs: $(foreach file,$(INSTALL_OBJ_LIBS),$(OBJDIRS:%=%/$(file))) $(BIN_OBJ_DIR)/libhugetlbfs_privutils.a + +tests: libs # Force make to build the library first +tests: tests/all + +tests/%: libs + $(MAKE) -C tests $* + +tools: $(foreach file,$(INSTALL_BIN),$(BIN_OBJ_DIR)/$(file)) + +check: all + cd tests; ./run_tests.py + +checkv: all + cd tests; ./run_tests.py -vV + +func: all + cd tests; ./run_tests.py -t func + +funcv: all + cd tests; ./run_tests.py -t func -vV + +stress: all + cd tests; ./run_tests.py -t stress + +stressv: all + cd tests; ./run_tests.py -t stress -vV + +# Don't want to remake objects just 'cos the directory timestamp changes +$(OBJDIRS): %: + @mkdir -p $@ + +# 
<Version handling> +$(VERSION): always + @$(VECHO) VERSION + ./localversion version $(SOURCE) +always: +# </Version handling> + +snapshot: $(VERSION) + +.SECONDARY: + +obj32/%.o: %.c + @$(VECHO) CC32 $@ + @mkdir -p obj32 + $(CC32) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +obj64/%.o: %.c + @$(VECHO) CC64 $@ + @mkdir -p obj64 + $(CC64) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +obj32/%.o: %.S + @$(VECHO) AS32 $@ + @mkdir -p obj32 + $(CC32) $(CPPFLAGS) -o $@ -c $< + +obj64/%.o: %.S + @$(VECHO) AS64 $@ + @mkdir -p obj64 + $(CC64) $(CPPFLAGS) -o $@ -c $< + +obj32/libhugetlbfs.a: $(LIBOBJS32) + @$(VECHO) AR32 $@ + $(AR) $(ARFLAGS) $@ $^ + +obj64/libhugetlbfs.a: $(LIBOBJS64) + @$(VECHO) AR64 $@ + $(AR) $(ARFLAGS) $@ $^ + +obj32/libhugetlbfs.so: $(LIBOBJS32) + @$(VECHO) LD32 "(shared)" $@ + $(CC32) $(LDFLAGS) -Wl,--version-script=version.lds -Wl,-soname,$(notdir $@) -shared -o $@ $^ $(LDLIBS) + +obj64/libhugetlbfs.so: $(LIBOBJS64) + @$(VECHO) LD64 "(shared)" $@ + $(CC64) $(LDFLAGS) -Wl,--version-script=version.lds -Wl,-soname,$(notdir $@) -shared -o $@ $^ $(LDLIBS) + +#obj32/libhugetlbfs_privutils.a: $(LIBPUOBJS:%=obj32/%) +# @$(VECHO) AR32 $@ +# $(AR) $(ARFLAGS) $@ $^ +# +#obj64/libhugetlbfs_privutils.a: $(LIBPUOBJS:%=obj64/%) +# @$(VECHO) AR64 $@ +# $(AR) $(ARFLAGS) $@ $^ + +$(BIN_OBJ_DIR)/libhugetlbfs_privutils.a: $(LIBPUOBJS:%=$(BIN_OBJ_DIR)/%) + @$(VECHO) ARHOST $@ + $(AR) $(ARFLAGS) $@ $^ + +obj32/libhugetlbfs_privutils.so: $(LIBPUOBJS:%=obj32/%) + @$(VECHO) LD32 "(shared)" $@ + $(CC32) $(LDFLAGS) -Wl,--version-script=privutils.lds -Wl,-soname,$(notdir $@) -shared -o $@ $^ $(LDLIBS) + +obj64/libhugetlbfs_privutils.so: $(LIBPUOBJS:%=obj64/%) + @$(VECHO) LD64 "(shared)" $@ + $(CC64) $(LDFLAGS) -Wl,--version-script=privutils.lds -Wl,-soname,$(notdir $@) -shared -o $@ $^ $(LDLIBS) + +obj32/%.i: %.c + @$(VECHO) CPP $@ + $(CC32) $(CPPFLAGS) -E $< > $@ + +obj64/%.i: %.c + @$(VECHO) CPP $@ + $(CC64) $(CPPFLAGS) -E $< > $@ + +obj32/%.s: %.c + @$(VECHO) CC32 -S $@ + $(CC32) $(CPPFLAGS) 
$(CFLAGS) -o $@ -S $< + +obj64/%.s: %.c + @$(VECHO) CC64 -S $@ + $(CC64) $(CPPFLAGS) $(CFLAGS) -o $@ -S $< + +$(BIN_OBJ_DIR)/%.o: %.c + @$(VECHO) CCHOST $@ + @mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) $(LIBPATHS) -o $@ -c $< + +$(BIN_OBJ_DIR)/hugectl: $(BIN_OBJ_DIR)/hugectl.o + @$(VECHO) LDHOST $@ + mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) -o $@ $^ + +$(BIN_OBJ_DIR)/hugeedit: $(BIN_OBJ_DIR)/hugeedit.o + @$(VECHO) LDHOST $@ + mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) $(LIBPATHS) -o $@ $^ + +HUGEADM_OBJ=hugeadm.o libhugetlbfs_privutils.a +$(BIN_OBJ_DIR)/hugeadm: $(foreach file,$(HUGEADM_OBJ),$(BIN_OBJ_DIR)/$(file)) + @$(VECHO) LDHOST $@ + mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) $(LIBPATHS) -o $@ $^ + +PAGESIZE_OBJ=pagesize.o libhugetlbfs_privutils.a +$(BIN_OBJ_DIR)/pagesize: $(foreach file,$(PAGESIZE_OBJ),$(BIN_OBJ_DIR)/$(file)) + @$(VECHO) LDHOST $@ + mkdir -p $(BIN_OBJ_DIR) + $(CCBIN) $(CPPFLAGS) $(CFLAGS) $(LIBPATHS) -o $@ $^ + +clean: + @$(VECHO) CLEAN + rm -f *~ *.o *.so *.a *.d *.i core a.out $(VERSION) + rm -rf obj* + rm -f ldscripts/*~ + rm -f libhugetlbfs-sock + $(MAKE) -C tests clean + +%.d: %.c $(VERSION) + @$(CC) $(CPPFLAGS) -MM -MT "$(foreach DIR,$(OBJDIRS),$(DIR)/$*.o) $@" $< > $@ + +# Workaround: Don't build dependencies for certain targets +# When the include below is executed, make will use the %.d target above to +# generate missing files. For certain targets (clean, version.h, etc) we don't +# need or want these dependency files, so don't include them in this case. 
+ifeq (,$(findstring <$(MAKECMDGOALS)>,$(NODEPTARGETS))) +-include $(DEPFILES) +endif + +obj32/install: + @$(VECHO) INSTALL-LIB32 $(LIBDIR32) + $(INSTALL) -d $(DESTDIR)$(LIBDIR32) + $(INSTALL) $(INSTALL_OBJ_LIBS:%=obj32/%) $(DESTDIR)$(LIBDIR32) + +obj64/install: + @$(VECHO) INSTALL-LIB64 $(LIBDIR64) + $(INSTALL) -d $(DESTDIR)$(LIBDIR64) + $(INSTALL) $(INSTALL_OBJ_LIBS:%=obj64/%) $(DESTDIR)$(LIBDIR64) + +objscript.%: % + @$(VECHO) OBJSCRIPT $* + sed "s!### SET DEFAULT LDSCRIPT PATH HERE ###!HUGETLB_LDSCRIPT_PATH=$(LDSCRIPTDIR)!" < $< > $@ + +install-libs: libs $(OBJDIRS:%=%/install) $(INSTALL_OBJSCRIPT:%=objscript.%) + $(INSTALL) -d $(DESTDIR)$(HEADERDIR) + $(INSTALL) -d $(DESTDIR)$(LDSCRIPTDIR) + $(INSTALL) -d $(DESTDIR)$(BINDIR) + $(INSTALL) -m 644 -t $(DESTDIR)$(HEADERDIR) $(INSTALL_HEADERS) + $(INSTALL) -m 644 $(INSTALL_LDSCRIPTS:%=ldscripts/%) $(DESTDIR)$(LDSCRIPTDIR) + for x in $(INSTALL_OBJSCRIPT); do \ + $(INSTALL) -m 755 objscript.$$x $(DESTDIR)$(BINDIR)/$$x; done + cd $(DESTDIR)$(BINDIR) && ln -sf ld.hugetlbfs ld + +install-man: + @$(VECHO) INSTALL_MAN $(DESTDIR)manX + $(INSTALL) -d $(DESTDIR)$(MANDIR1) + $(INSTALL) -d $(DESTDIR)$(MANDIR3) + $(INSTALL) -d $(DESTDIR)$(MANDIR7) + $(INSTALL) -d $(DESTDIR)$(MANDIR8) + for x in $(INSTALL_MAN1); do \ + $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR1); \ + gzip -f $(DESTDIR)$(MANDIR1)/$$x; \ + done + for x in $(INSTALL_MAN3); do \ + $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR3); \ + gzip -f $(DESTDIR)$(MANDIR3)/$$x; \ + done + rm -f $(DESTDIR)$(MANDIR3)/free_huge_pages.3.gz + rm -f $(DESTDIR)$(MANDIR3)/free_hugepage_region.3.gz + ln -s get_huge_pages.3.gz $(DESTDIR)$(MANDIR3)/free_huge_pages.3.gz + ln -s get_hugepage_region.3.gz $(DESTDIR)$(MANDIR3)/free_hugepage_region.3.gz + for x in $(INSTALL_MAN7); do \ + $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR7); \ + gzip -f $(DESTDIR)$(MANDIR7)/$$x; \ + done + for x in $(INSTALL_MAN8); do \ + $(INSTALL) -m 444 man/$$x $(DESTDIR)$(MANDIR8); \ + gzip -f 
$(DESTDIR)$(MANDIR8)/$$x; \ + done + +install-bin: + @$(VECHO) INSTALL_BIN $(DESTDIR)$(EXEDIR) + $(INSTALL) -d $(DESTDIR)$(EXEDIR) + for x in $(INSTALL_BIN); do \ + $(INSTALL) -m 755 $(BIN_OBJ_DIR)/$$x $(DESTDIR)$(EXEDIR); done + +install-stat: install-perlmod + @$(VECHO) INSTALL_SCRIPT $(DESTDIR)$(EXEDIR) + $(INSTALL) -d $(DESTDIR)$(EXEDIR) + for x in $(INSTALL_SCRIPT); do \ + $(INSTALL) -m 755 $$x $(DESTDIR)$(EXEDIR); done + +install-perlmod: + @$(VECHO) INSTALL_PERLMOD $(DESTDIR)$(PMDIR) + $(INSTALL) -d $(DESTDIR)$(PMDIR) + for x in $(INSTALL_PERLMOD); do \ + $(INSTALL) -m 644 $(PM_OBJ_DIR)/$$x $(DESTDIR)$(PMDIR); done + +install: install-libs install-bin install-man install-stat + +install-helper: + @$(VECHO) INSTALL_HELPER $(DESTDIR)$(EXEDIR) + $(INSTALL) -d $(DESTDIR)$(EXEDIR) + for x in $(INSTALL_HELPER); do \ + $(INSTALL) -m 755 $$x $(DESTDIR)$(EXEDIR); done + +install-docs: + $(INSTALL) -d $(DESTDIR)$(DOCDIR) + for x in $(EXTRA_DIST); do $(INSTALL) -m 755 $$x $(DESTDIR)$(DOCDIR)/$$x; done + +install-tests: tests install # Force make to build tests and install the library first + ${MAKE} -C tests install DESTDIR=$(DESTDIR) OBJDIRS="$(OBJDIRS)" LIB32=$(LIB32) LIB64=$(LIB64) diff --git a/default/libhugetlbfs/libhugetlbfs/NEWS b/default/libhugetlbfs/libhugetlbfs/NEWS new file mode 100644 index 0000000..5b90c52 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/NEWS @@ -0,0 +1,357 @@ +libhugetlbfs 2.13 "Insert Clever Title Here" +====================================================================== +New Features +* hugeadm can now be used to control Transparent Huge Page tunables +* New morecore mode to better support THP + +Bug Fixes +* Check permissions on hugetlbfs mount point before marking it as available + +Test Suite +* Fix shm tests to use random address instead of fixed, old address failed + on ARM + +libhugetlbfs 2.12 "Serrano" +====================================================================== +New Features +* libhugetlbfs usage can now be 
restricted to certain binary names +* libhugetlbfs now supports static linking +* hugeadm uses more human readable directory names for mount points + +Bug Fixes +* hugeadm would segfault if specified user was not in passwd, failure in + getpwuid() is now checked + +Test Suite +* Added missing tests to driver script +* Added tests for static linking + +libhugetlbfs 2.11 "Ghost Pepper" +====================================================================== +New Features +* cpupcstat reports time servicing tlb misses when requested +* When supported by the kernel and glibc, MAP_HUGETLB is used + for the heap and to back memory returned by get_huge_pages. + These features can now be used without mounting hugetlbfs + +Bug Fixes +* tlbmiss_cost.sh suppresses oprofile errors +* numerous fixes to setup_helper.py +* Corrected usage of hugetlbfs_test_feature return value +* find_mounts now correctly ignores non-hugetlbfs mount points +* When prefaulting pages for get_huge_pages readv was using the fd + for the mapping, this caused the prefault to fail on older libc. 
+ Now /dev/zero is used for all prefaulting + +libhugetlbfs 2.10 "Another Hottie" +====================================================================== +Bug Fixes +* hugeadm now handles pool size deltas properly +* Makefile uses ?= to assign PREFIX and EXEDIR to allow for easier build + modification + +libhugetlbfs 2.9 "Something Spicy" +====================================================================== +New Features +* Add --no-reserve to hugectl to request mmap'd pages are not reserved + for kernels newer than 2.6.34 +* POWER6 now supported by TLB miss cost estimator +* Add --obey-numa-mempol to hugeadm to request static pool pages are + allocated following the process NUMA memory policy + +Test Suite +* Fix gethugepagesizes test case + +libhugetlbfs 2.8 "Scotch Bonnet" +====================================================================== +New Features +* Add switch to let administrator limit new mount points by size or inodes +* cpupcstat now caches the value returned by tlbmiss_cost.sh to avoid + rerunning the script + +Bug Fixes +* errno values are saved in get_huge_pages +* tlbmiss_cost.sh patches calibrator to fix round naming collision +* Fixed ALIGN_UP macro for aligning huge page segments +* Fix --create-mounts switch in hugeadm +* Library and helpers are all linked with -z noexecstack + +Test Suite +* run_tests.py detects valid word sizes + +libhugetlbfs 2.7 "Adobo" +====================================================================== +New Features +* When specifying huge page pool sizes with hugeadm, memory sizes can + be used as well as the number of huge pages +* DEFAULT is now a valid huge page pool for resizing, it will adjust + the pool for the default huge page size +* tlbmiss_cost.sh in the contrib/ sub directory will estimate the cost + in CPU cycles of a TLB miss on the arch where it is run +* Add python script which automates huge page pool setup with minimal + input required from user + +Bug Fixes +* The --dry-run switch in hugeadm is 
now obeyed +* hugeadm now uses unsigned long long for page resizes to avoid + overflow errors +* --set-recommended-shmmax no longer overflows if the number of + available huge pages is bigger than the address space + +Test Suite +* Updated linkhuge_nofd to override proper functions when testing +* run_tests.py can now monitor the pool sizes between tests to help + identify accounting errors +* Add test for mremap bug on architectures with holes in address space + +libhugetlbfs 2.6 "Adovada" +====================================================================== +New Features +* cpupcstat now supports data collection using the perf tool as well as + oprofile +* --explain reports if min_free_kbytes is too small +* add --set-min_free_kbytes to hugeadm + +Bug Fixes +* Admin utils (hugeadm, hugectl, etc) are now built as 64 bit binaries + if possible to support administration of larger huge page sizes + +Test Suite +* Suppress ld.hugetlbfs warnings during test suite build +* Make SPECIAL keyword test cross-compile safe +* Test script sets proper rlimits for mlock and stack_grow_into_huge + tests +* Ensure that all elflink tests are run with both HUGETLB_SHARE=0 and + HUGETLB_SHARE=1 + +libhugetlbfs 2.5 "Self Titled" +====================================================================== +New Features +* added --add-ramdisk-swap option to hugeadm to use ramdisks as + temporary swap space for diskless systems +* added --persist option to hugeadm to be used with either --add-*-swap + option. 
Makes swap added stay until removed or the machine is rebooted +* added cpupcstat script which uses oprofile to monitor tlb miss rate + of a target program + +Bug Fixes +* --add-temp-swap option now takes an optional integer argument that is + the size in number of hugepages to make the swap space + +libhugetlbfs 2.4 "Birdseye" +====================================================================== +New Features +* added --add-temp-swap option to hugeadm to add a swap file for a pool + resize +* added --[enable|disable]-zone-moveable options to hugeadm to control + /proc/sys/vm/hugepages_treat_as_movable + +Bug Fixes +* Fix pool-pages-max processing by using the proper array for its + requests +* Move private reservation check out of morecore setup + +Test Suite +* Added regression tests for leaking reserve count due to madvise and + fadvise and readahead +* Add test for mixed permissions on shm segments +* Wrap tests that can hang the machine to fail if kernel is too old +* Add -f option to run_tests.py to force running of tests that can hang + the machine + +libhugetlbfs 2.3 "NuMex Sunglo" +====================================================================== +New Features +* added --force-preload option to hugectl for backing segments with + 64kb pages on ppc64 when app was not linked with libhugetlbfs +* added --explain switch to hugeadm to give a quick overview of the + system wrt huge pages +* hugeadm warns if min pool size is being adjusted without sufficient + swap space configured +* added --hard switch to ask hugeadm to try multiple times to resize + a huge page pool +* added --create-*-mounts switches to create mount points for hugetlbfs + usable by specific users, groups, or globally + +Bug Fixes +* hugeadm will no longer mount a directory multiple times +* hugeadm adds all new mount points to /etc/mtab + +libhugetlbfs 2.2 "Barkers Extra Hot" +====================================================================== +New Features +* Refactored environment 
variable parsing to read once and store values +* Add --list-mounts and --list-all-mounts options to hugeadm +* Rework test suite to run for all available page sizes +* Add --create-mounts for root only, --create-user-mounts, + --create-group-mounts, and --create-global-mounts options to hugeadm +* Add --share-text option to hugectl + +Test Suite Fixes +* Added wrapper to shm-fork and shm-getraw tests that makes runs on + hpage sizes different from default expected failures +* Reworked shmoverride tests to handle new env parsing + +libhugetlbfs 2.1 "NM 64" +====================================================================== +New Features +* Multiple page size support +* Add a more user friendly allocator that handles cache coloring +* Add pagesize utility to display supported page sizes +* Add hugeadm utility for managing hugepage pools +* Add hugectl utility for running programs using hugepages +* Add hugeedit utility for marking segments in aligned binaries for + huge page backing by default +* Deprecated linker scripts +* gethugepagesizes() and getpagesizes() API added to programmatically + discover supported hugepages +* Manual pages for all API functions and utilities +* Allow automatic backing of shared memory segments with hugepages +* huge page regions will no longer prefault for kernels >= 2.6.27 + improving mmap() performance and NUMA layout + +Bug Fixes +* Add missing segment to interp section in linker scripts +* Fix free_hugepage_region to handle segments that fell back to small + pages +* Fix install when lib32 and lib64 resolve to the same place +* Install header files so APIs can be used +* Fix installation paths to make life easier for package maintainers +* Do not export internal symbols unnecessarily +* Prefault regions allocated by direct allocation API on kernels older + than 2.6.27 +* Gracefully fallback to using base pages for text/data when the + hugepage pool is too small +* Fix handling of HUGETLB_SHARE_PATH environment variable +* 
Relax remapping requirements + +Test suite Fixes +* Added expected failure support +* gethugepagesizes override for getting meminfo +* Increase debug output for tests that fail +* Summarise pass and failure counts + +libhugetlbfs 2.0 "Sandia Red" +====================================================================== + +New Features +* New scriptless relinking for binutils >= 2.17 +* Added direct allocator API for huge pages + +Bug Fixes +* /proc/mounts is parsed line at a time to handle file larger than 4kb +* Read-only mappings use MAP_NORESERVE + +Test suite fixes +* Add test for /proc/mounts file larger than 4kb +* Fix quota test with private reservations +* Output strerror on failure +* linkhuge tests are skipped when known to be broken + +libhugetlbfs 1.3 "Big Jim" +====================================================================== + +New features +* Add HUGETLB_NO_PREFAULT to control prefaulting of huge pages via mlock +* Add "[hostname:pid]" to output messages +* Setup for handling larger huge page sizes e.g. 16G huge pages +* Update for new upstream sysctl +* Add support for hugetlbfs_morecore to shrink the heap + +Bug fixes +* Disable heap shrinking by default to avoid bug in glibc malloc +* Skip elflink calls in setup_libhugetlbfs for IA64 and sparc64 +* Replace gethugepagesize with check_hugepagesize for error checking +* Make morecore argument a long to handle larger page sizes + +Test suite fixes +* Check uid/gid in tests where it matters +* tests: verify there are enough huge pages +* Change tests to read /proc/meminfo +* tests: verify that huge page size isn't too big for the test + +libhugetlbfs 1.2 "Mango Habanero" +====================================================================== + +New features + +* Partial segment remapping. This allows non-relinked binaries to try + to take advantage of libhugetlbfs' segment remapping code. Large + segments are required, especially on Power. 
This feature is useful + for estimating huge page performance, however full relinking will + still perform better. +* Add extra debugging for binaries that may run out of address space. +* Log library version when HUGETLB_VERBOSE is enabled. +* Beginning support for ia64 and sparc64. +* New test to check handling of misaligned mmap() parameters. + +Bug fixes + +* Fix EH_FRAME segment. Fixes some C++ applications. +* Rework PLT detection to work better on Power. +* Add per-target-arch syscall stubs to the library. These provide + reliable error messages from elflink.c if they occur while the + program segments are unmapped. +* Add proper SONAME to shared libs. +* Makefile respects CFLAGS/LDFLAGS/CPPFLAGS environment variables. +* Make mlock() failure non-fatal. + +Test suite fixes + +* Fix zero_filesize_segment test. +* Fix the icache-hygeine testcase for i386 and x86_64. +* Fix SEGVs in task-size-overrun test. +* Enable stack_grow_into_huge test, previously skipped. +* Fix test_addr_huge() for anon pages. + +libhugetlbfs 1.1 "Carribbean Jerk" +====================================================================== + +This large release brings several performance improvements + +Security + +* Remove the sharing daemon completely and rework the segment sharing + code. Permissions on the hugetlbfs mountpoint are used to enforce + segment sharing. + +Bug fixes + +* Sharing of writable segments is no longer possible, due to address + space randomization on x86_64 (although similar issues could happen on + any architecture). +* minimal_copy detection should work better in this release. + +Trivial but notable changes + +* Testcase updates + +libhugetlbfs 1.0.1 "Spicy Garlic" +====================================================================== + +This small maintenance release brings a security fix, a few minor bug +fixes, plus some documentation and error message updates. 
+ +Security + +* A section on security has been added to the README file +* The hugetlbd daemon socket has been moved from /tmp to /var/run. + This will require the daemon to be run as root, which was previously + just a recommendation. + +Bug fixes + +* Reduce reserved huge pages needed for application start-up +* PPC linker script fixes + +Trivial but notable changes + +* Always install linker scripts for all targets +* Error message updates +* Add documentation on HUGETLB_DEBUG +* Testcase updates + +libhugetlbfs 1.0 +====================================================================== + +* First stable release diff --git a/default/libhugetlbfs/libhugetlbfs/README b/default/libhugetlbfs/libhugetlbfs/README new file mode 100644 index 0000000..d97e308 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/README @@ -0,0 +1,42 @@ +10/03/2006 -- libhugetlbfs-1.0 Released + +After roughly one year in development, version 1.0 of libhugetlbfs is here. +It can be downloaded from SourceForge or the OzLabs mirror: + + http://sourceforge.net/project/showfiles.php?group_id=156936 + http://libhugetlbfs.ozlabs.org/snapshots/ + +After a series of preview releases, we have tested a huge array of the +supported usage scenarios using benchmarks and real HPC applications. +Usability and reliability have greatly improved. But... due to the +incredible diversity of applications that exist, there is bound to be a few +that will not work correctly. + +If using libhugetlbfs makes your application slower: + + * Play around with the different combinations of hugetlb malloc and the + two different supported link types to see which combination works best. + + * Keep in mind that huge pages are a niche performance tweak and are not + suitable for every type of application. They are specifically known to + hurt performance in certain situations. + +If you experience problems: + + * You've already read the HOWTO document, but read through it again. 
It + is full of hints, notes, warnings, and caveats that we have found over + time. This is the best starting point for a quick resolution to your + issue. + + * Make sure you have enough huge pages allocated. Even if you think you + have enough, try increasing it to a number you know you will not use. + + * Set HUGETLB_VERBOSE=99 and HUGETLB_DEBUG=yes. These options increase + the verbosity of the library and enable extra checking to help diagnose + the problem. + +If the above steps do not help, send as much information about the problem +(including all libhugetlbfs debug output) to +libhugetlbfs-devel@xxxxxxxxxxxxxxxxxxxxx and we'll help out as much as we +can. We will probably ask you to collect things like: straces, +/proc/pid/maps and gdb back traces. diff --git a/default/libhugetlbfs/libhugetlbfs/TLBC/DataCollect.pm b/default/libhugetlbfs/libhugetlbfs/TLBC/DataCollect.pm new file mode 100644 index 0000000..631fe74 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/TLBC/DataCollect.pm @@ -0,0 +1,55 @@ +# +# DataCollect.pm +# +# This module is the base class for DTLB data collection. This class is a +# interface class only, to add a new collection method inherit from this +# class and use it in calc_missrate.pl +# Licensed under LGPL 2.1 as packaged with libhugetlbfs +# (c) Eric Munson 2009 + +package TLBC::DataCollect; + +use warnings; +use strict; +use Carp; +use base; + +sub new() +{ +} + +## +# The setup method should take care of setting up the data collector for +# collecting event data. This method takes no args and returns $self + +sub setup() +{ +} + +## +# This method should the return the total event count as of its +# invocation. This method takes no args and it returns the total number +# of events. + +sub get_current_eventcount() +{ +} + +## +# This method will read the counter information from the data source. +# This was separated from get_current_eventcount to provide a logical +# way of handling multiple events. 
+ +sub read_eventcount() +{ +} + +## +# The shutdown method should stop the data collection and do any clean up +# necessary. This method takes no args and returns $self + +sub shutdown() +{ +} + +1; diff --git a/default/libhugetlbfs/libhugetlbfs/TLBC/OpCollect.pm b/default/libhugetlbfs/libhugetlbfs/TLBC/OpCollect.pm new file mode 100644 index 0000000..6ec477f --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/TLBC/OpCollect.pm @@ -0,0 +1,185 @@ +# +# OpCollect.pm +# +# This module contains all the setup, data collection, and cleanup methods +# for collecting CPU performance counter information from oprofile. +# Licensed under LGPL 2.1 as packaged with libhugetlbfs +# (c) Eric Munson 2009 + +package TLBC::OpCollect; + +use warnings; +use strict; +use Carp; + +use FindBin qw($Bin); +use lib "$Bin/lib"; +use TLBC::DataCollect; + +our @ISA = qw(TLBC::DataCollect); + +my $reference; +my $report; +my (%event_map, %lowlevel); +my (%event_col_map, %event_name); + +#use interface 'DataCollect'; + +sub _clear_oprofile() +{ + my $self = shift; + system("opcontrol --reset > /dev/null 2>&1"); + system("opcontrol --stop > /dev/null 2>&1"); + system("opcontrol --reset > /dev/null 2>&1"); + system("opcontrol --deinit > /dev/null 2>&1"); + return $self; +} + +sub _get_event() +{ + my $self = shift; + my $event = shift; + my $lowlevel_event; + + $lowlevel_event = `$Bin/oprofile_map_events.pl --event $event 2>/dev/null`; + chomp($lowlevel_event); + if ($lowlevel_event eq "" || $lowlevel_event !~ /^[A-Z0-9_]+:[0-9]+/) { + die "Unable to find $event event for this CPU\n"; + } + $event_map{$event} = $lowlevel_event; + return $self; +} + +sub _setup_oprofile() +{ + my $self = shift; + my $vmlinux = shift; + my $refEvents = shift; + my $cmd = "$Bin/oprofile_start.sh --sample-cycle-factor 6 --sample-event-factor 2 --vmlinux=$vmlinux "; + foreach my $event (@{$refEvents}) { + $cmd .= " --event=$event"; + $self->_get_event($event); + } + $cmd .= " > /dev/null 2>&1"; + system($cmd) == 0 
or return 0; + return $self; +} + +sub new() +{ + my $class = shift; + if ($reference) { + return $reference; + } + + $reference = {@_}; + bless($reference, $class); + return $reference; +} + +sub setup() +{ + my $self = shift; + my $vmlinux = shift; + my $refEvents = shift; + $self->_clear_oprofile(); + return $self->_setup_oprofile($vmlinux, $refEvents); +} + +sub samples() +{ + my $self = shift; + my $event = shift; + my $count = 0; + my $lowlevel; + my @vals; + $lowlevel = $event_map{$event}; + if (!$lowlevel) { + die "Unable to locate count and event for $event for this CPU.\n"; + } + @vals = split(/:/, $lowlevel); + return $vals[1]; +} + +sub _get_column() +{ + my $self = shift; + my $event = shift; + my @results; + my $line; + my $col = $event_col_map{$event}; + + if ($col) { + return $col; + } + + @results = split(/\n/, $report); + foreach $line (@results) { + if ($line =~ /$event.*\|/) { + my @vals = split(/\|/, $line); + my $size = @vals; + + for (my $i = 0; $i < $size; $i++) { + if ($vals[$i] =~ /$event/) { + $event_col_map{$event} = $i; + return $i; + } + } + die "Unable to find event column.\n"; + } + } + die "Unable to find column labels.\n"; +} + +sub get_current_eventcount() +{ + my @results; + my $line; + my $hits = 0; + my $self = shift; + my $binName = shift; + my $event = shift; + my $col = 0; + + my $lowlevel = $event_map{$event}; + if (!$lowlevel) { + die "Unable to locate event for $event for this CPU.\n"; + } + my @vals = split(/:/, $lowlevel); + $event = $vals[0]; + # The event column in opreport only uses the first 12 letters of + # the event name + $event = substr($event, 0, 12); + @results = split(/\n/, $report); + $col = $self->_get_column($event); + + foreach $line (@results) { + if ($line !~ /^\s+[0-9]/) { + next; + } + if ($binName eq "/" || $line =~ /$binName/) { + chomp($line); + $line =~ s/^\s+//; + $line =~ s/\s+$//; + $line =~ s/\s+/ /g; + my @vals = split(/ /, $line); + $hits += $vals[$col * 2]; + } + } + return $hits; +} + 
+sub read_eventcount() +{ + system("opcontrol --dump > /dev/null 2>&1"); + $report = `opreport -x 2> /dev/null`; +} + +sub shutdown() +{ + my $self = shift; + _clear_oprofile(); + return $self; +} + +1; diff --git a/default/libhugetlbfs/libhugetlbfs/TLBC/PerfCollect.pm b/default/libhugetlbfs/libhugetlbfs/TLBC/PerfCollect.pm new file mode 100644 index 0000000..f44d920 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/TLBC/PerfCollect.pm @@ -0,0 +1,144 @@ +# +# PerfCollect.pm +# +# This module contains all the setup, data collection, and cleanup methods +# for collecting CPU performance counter information from Ingo's perf tool. +# Licensed under LGPL 2.1 as packaged with libhugetlbfs +# (c) Eric Munson 2009 + +package TLBC::PerfCollect; + +use warnings; +use strict; +use Carp; + +use FindBin qw($Bin); +use lib "$Bin/lib"; +use TLBC::DataCollect; + +our @ISA = qw(TLBC::DataCollect); + +my $perf_output = "/tmp/perf_" . $$ . ".data"; +my $reference; +my $report; +my $perf_pid; +my $perf_bin; +my $vmlinux; +my (%map_event_name, %map_event_mask); + +$map_event_name{"i386###dtlb_miss"} = "-e dTLB-miss"; +$map_event_name{"x86-64###dtlb_miss"} = "-e dTLB-miss"; +$map_event_name{"ppc64###dtlb_miss"} = "-e dTLB-miss"; + +sub _get_event() +{ + my $self = shift; + my $arch = shift; + my $event = shift; + my $ret; + + $ret = $map_event_name{"$arch###$event"}; + if (not defined $ret or $ret eq "") { + return ""; + } + return $ret; +} + +sub new() +{ + my $class = shift; + if ($reference) { + return $reference; + } + + $reference = {@_}; + bless($reference, $class); + return $reference; +} + +sub setup() +{ + my $self = shift; + $vmlinux = shift; + my $event_name = shift; + $perf_bin = `which perf`; + if (!$perf_bin) { + return 0; + } + chomp($perf_bin); + + my $arch = `uname -m`; + chomp($arch); + $arch =~ s/i.86/i386/g; + my $event = $self->_get_event($arch, $event_name); + if ($event eq "") { + return 0; + } + + my $cmd = $perf_bin . " record -a -f -o $perf_output ". 
$event; + + $perf_pid = fork(); + if (not defined $perf_pid) { + return 0; + } elsif ($perf_pid == 0) { + exec $cmd or die "Failed to start perf monitor\n"; + } else { + return $self; + } +} + +sub samples() +{ + return 1; +} + +sub get_current_eventcount() +{ + my $self = shift; + my $binName = shift; + my $count = 0; + my $total; + my $line; + my $hits; + my @lines = split(/\n/, $report); + + # @lines[2] will contain the total number of samples + $lines[2] =~ m/(\d+)/; + $total = $1; + + if ($binName eq "vmlinux") { + $binName = "kernel"; + } + + foreach $line (@lines) { + if ($binName eq "/" || $line =~ /$binName/) { + chomp($line); + $line =~ s/^\s+//; + $line =~ s/\s+$//; + $line =~ m/(\d+\.\d+)%/; + # $1 should hold the percentage of hits for this + # $binName + $count += int(($1 / 100) * $total); + } + } + + return $count; +} + +sub read_eventcount() +{ + my $cmd = $perf_bin . " report -k $vmlinux -i $perf_output"; + $report = `$cmd`; +} + +sub shutdown() +{ + my $self = shift; + my $cmd = "kill $perf_pid"; + system($cmd); + unlink $perf_output; + return $self; +} + +1; + diff --git a/default/libhugetlbfs/libhugetlbfs/TLBC/Report.pm b/default/libhugetlbfs/libhugetlbfs/TLBC/Report.pm new file mode 100644 index 0000000..bea0c05 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/TLBC/Report.pm @@ -0,0 +1,58 @@ +# +# Report.pm +# +# This very simple module is simply for keeping report generation +# in the same place. 
The code is basically a glorified collection +# of print statements +# Licensed under LGPL 2.1 as packaged with libhugetlbfs +# (c) Mel Gorman 2003 + +package TLBC::Report; +require Exporter; +use vars qw (@ISA @EXPORT); +use strict; +my $verbose; + +@ISA = qw(Exporter); +@EXPORT = qw(&setVerbose &printVerbose &reportPrint &reportOpen &reportClose); + +## +# setVerbose - Set the verbose flag +sub setVerbose { + $verbose = 1; +} + +## +# printVerbose - Print debugging messages if verbose is set +# @String to print +sub printVerbose { + $verbose && print @_; +} + +## +# reportPrint - Print a string verbatim to the report +# @string: String to print +sub reportPrint { + my ($string) = @_; + + print HTML $string; +} + +## +# +# reportOpen - Open a new report +# @filename: Filename of report to open +sub reportOpen { + my ($filename) = @_; + + open (HTML, ">$filename") or die("Could not open $filename"); +} + +## +# +# reportClose - Close the report +sub reportClose { + close HTML; +} + +1; diff --git a/default/libhugetlbfs/libhugetlbfs/alloc.c b/default/libhugetlbfs/libhugetlbfs/alloc.c new file mode 100644 index 0000000..bce9464 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/alloc.c @@ -0,0 +1,337 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * alloc.c - Simple allocator of regions backed by hugepages + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE +#include <fcntl.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <time.h> +#include <sys/mman.h> +#include <sys/types.h> + +#include "hugetlbfs.h" +#include "libhugetlbfs_internal.h" + +/* Allocate base pages if huge page allocation fails */ +static void *fallback_base_pages(size_t len, ghp_t flags) +{ + int fd; + void *buf; + INFO("get_huge_pages: Falling back to base pages\n"); + + /* + * Map /dev/zero instead of MAP_ANONYMOUS avoid VMA mergings. Freeing + * pages depends on /proc/pid/maps to find lengths of allocations. + * This is a bit lazy and if found to be costly due to either the + * extra open() or virtual address space usage, we could track active + * mappings in a lock-protected list instead. + */ + fd = open("/dev/zero", O_RDWR); + if (fd == -1) { + ERROR("get_huge_pages: Failed to open /dev/zero for fallback"); + return NULL; + } + + buf = mmap(NULL, len, + PROT_READ|PROT_WRITE, + MAP_PRIVATE, + fd, 0); + if (buf == MAP_FAILED) { + WARNING("Base page fallback failed: %s\n", strerror(errno)); + buf = NULL; + } + close(fd); + + return buf; +} + +/** + * get_huge_pages - Allocate an amount of memory backed by huge pages + * len: Size of the region to allocate, must be hugepage-aligned + * flags: Flags specifying the behaviour of the function + * + * This function allocates a region of memory that is backed by huge pages + * and hugepage-aligned. This is not a suitable drop-in for malloc() but a + * a malloc library could use this function to create a new fixed-size heap + * similar in principal to what morecore does for glibc malloc. 
+ */ +void *get_huge_pages(size_t len, ghp_t flags) +{ + void *buf; + int buf_fd = -1; + int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0; + int mmap_hugetlb = 0; + int ret; + + /* Catch an altogether-too easy typo */ + if (flags & GHR_MASK) + ERROR("Improper use of GHR_* in get_huge_pages()\n"); + +#ifdef MAP_HUGETLB + mmap_hugetlb = MAP_HUGETLB; +#endif + + if (__hugetlb_opts.map_hugetlb && + gethugepagesize() == kernel_default_hugepage_size()) { + /* Because we can use MAP_HUGETLB, we simply mmap the region */ + buf = mmap(NULL, len, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_ANONYMOUS|mmap_hugetlb|mmap_reserve, + 0, 0); + } else { + /* Create a file descriptor for the new region */ + buf_fd = hugetlbfs_unlinked_fd(); + if (buf_fd < 0) { + WARNING("Couldn't open hugetlbfs file for %zd-sized buffer\n", + len); + return NULL; + } + + /* Map the requested region */ + buf = mmap(NULL, len, PROT_READ|PROT_WRITE, + MAP_PRIVATE|mmap_reserve, buf_fd, 0); + } + + if (buf == MAP_FAILED) { + if (buf_fd >= 0) + close(buf_fd); + + WARNING("get_huge_pages: New region mapping failed (flags: 0x%lX): %s\n", + flags, strerror(errno)); + return NULL; + } + + /* Fault the region to ensure accesses succeed */ + ret = hugetlbfs_prefault(buf, len); + if (ret != 0) { + munmap(buf, len); + if (buf_fd >= 0) + close(buf_fd); + + WARNING("get_huge_pages: Prefaulting failed (flags: 0x%lX): %s\n", + flags, strerror(ret)); + return NULL; + } + + /* Close the file so we do not have to track the descriptor */ + if (buf_fd >= 0 && close(buf_fd) != 0) { + WARNING("Failed to close new buffer fd: %s\n", strerror(errno)); + munmap(buf, len); + return NULL; + } + + /* woo, new buffer of shiny */ + return buf; +} + +#define MAPS_BUF_SZ 4096 +static void __free_huge_pages(void *ptr, int aligned) +{ + FILE *fd; + char line[MAPS_BUF_SZ]; + unsigned long start = 0, end = 0; + unsigned long palign = 0, hpalign = 0; + unsigned long hpalign_end = 0; + + /* + * /proc/self/maps is used to 
determine the length of the original + * allocation. As mappings are based on different files, we can + * assume that maps will not merge. If the hugepages were truly + * anonymous, this assumption would be broken. + */ + fd = fopen("/proc/self/maps", "r"); + if (!fd) { + ERROR("Failed to open /proc/self/maps\n"); + return; + } + + /* + * An unaligned address allocated by get_hugepage_region() + * could be either page or hugepage aligned + */ + if (!aligned) { + palign = ALIGN_DOWN((unsigned long)ptr, getpagesize()); + hpalign = ALIGN_DOWN((unsigned long)ptr, gethugepagesize()); + } + + /* Parse /proc/maps for address ranges line by line */ + while (!feof(fd)) { + char *bufptr; + char *saveptr = NULL; + + /* Read a line of input */ + if (fgets(line, MAPS_BUF_SZ, fd) == NULL) + break; + + /* Parse the line to get the start and end of each mapping */ + bufptr = strtok_r(line, " ", &saveptr); + bufptr = strtok_r(bufptr, "-", &saveptr); + start = strtoull(bufptr, NULL, 16); + bufptr = strtok_r(NULL, "-", &saveptr); + + /* If the correct mapping is found, remove it */ + if (start == (unsigned long)ptr) { + end = strtoull(bufptr, NULL, 16); + munmap(ptr, end - start); + break; + } + + /* If the passed address is aligned, just move along */ + if (aligned) + continue; + + /* + * If an address is hpage-aligned, record it but keep looking. + * We might find a page-aligned or exact address later + */ + if (start == hpalign) { + hpalign_end = strtoull(bufptr, NULL, 16); + continue; + } + + /* If an address is page-aligned, free it */ + if (start == palign) { + end = strtoull(bufptr, NULL, 16); + munmap((void *)start, end - start); + break; + } + + } + + /* + * If no exact or page-aligned address was found, check for a + * hpage-aligned address. 
If found, free it, otherwise warn that + * the ptr pointed nowhere + */ + if (end == 0) { + if (hpalign_end == 0) + ERROR("hugepages_free using invalid or double free\n"); + else + munmap((void *)hpalign, hpalign_end - hpalign); + } + + fclose(fd); +} + +/** + * free_huge_pages - Free a region allocated that was backed by large pages + * ptr - The pointer to the buffer returned by get_huge_pages() + * + * This function finds a region to free based on the contents of + * /proc/pid/maps. The assumption is made that the ptr is the start of + * a hugepage region allocated with free_huge_pages. No checking is made + * that the pointer is to a hugepage backed region. + */ +void free_huge_pages(void *ptr) +{ + __free_huge_pages(ptr, 1); +} + +/* + * Offset the buffer using bytes wasted due to alignment to avoid using the + * same cache lines for the start of every buffer returned by + * get_huge_pages(). A small effort is made to select a random cacheline + * rather than sequential lines to give decent behaviour on average. 
+ */ +void *cachecolor(void *buf, size_t len, size_t color_bytes) +{ + static long cacheline_size = 0; + static int linemod = 0; + char *bytebuf = (char *)buf; + int numlines; + int line = 0; + + /* Lookup our cacheline size once */ + if (cacheline_size == 0) { + cacheline_size = sysconf(_SC_LEVEL2_CACHE_LINESIZE); + linemod = time(NULL); + } + + numlines = color_bytes / cacheline_size; + DEBUG("%d lines of cacheline size %ld due to %zd wastage\n", + numlines, cacheline_size, color_bytes); + if (numlines) { + line = linemod % numlines; + bytebuf += cacheline_size * line; + + /* Pseudo-ish random line selection */ + linemod += len % numlines; + } + DEBUG("Using line offset %d from start\n", line); + + return bytebuf; +} + +/** + * get_hugepage_region - Allocate an amount of memory backed by huge pages + * + * len: Size of the region to allocate + * flags: Flags specifying the behaviour of the function + * + * This function allocates a region of memory backed by huge pages. Care should + * be taken when using this function as a drop-in replacement for malloc() as + * memory can be wasted if the length is not hugepage-aligned. This function + * is more relaxed than get_huge_pages() in that it allows fallback to small + * pages when requested. 
+ */ +void *get_hugepage_region(size_t len, ghr_t flags) +{ + size_t aligned_len, wastage; + void *buf; + + /* Catch an altogether-too easy typo */ + if (flags & GHP_MASK) + ERROR("Improper use of GHP_* in get_hugepage_region()\n"); + + /* Align the len parameter to a hugepage boundary and allocate */ + aligned_len = ALIGN(len, gethugepagesize()); + buf = get_huge_pages(aligned_len, GHP_DEFAULT); + if (buf == NULL && (flags & GHR_FALLBACK)) { + aligned_len = ALIGN(len, getpagesize()); + buf = fallback_base_pages(len, flags); + } + + /* Calculate wastage for coloring */ + wastage = aligned_len - len; + if (wastage != 0 && !(flags & GHR_COLOR)) + DEBUG("get_hugepage_region: Wasted %zd bytes due to alignment\n", + wastage); + + /* Only colour if requested */ + if (flags & GHR_COLOR) + buf = cachecolor(buf, len, wastage); + + return buf; +} + +/** + * free_hugepage_region - Free a region allocated by get_hugepage_region + * ptr - The pointer to the buffer returned by get_hugepage_region + * + * This function finds a region to free based on the contents of + * /proc/pid/maps. The assumption is made that the ptr is the start of + * a hugepage region allocated with get_hugepage_region. No checking is made + * that the pointer is to a hugepage backed region. + */ +void free_hugepage_region(void *ptr) +{ + __free_huge_pages(ptr, 0); +} diff --git a/default/libhugetlbfs/libhugetlbfs/contrib/tlbmiss_cost.sh b/default/libhugetlbfs/libhugetlbfs/contrib/tlbmiss_cost.sh new file mode 100755 index 0000000..1f1e234 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/contrib/tlbmiss_cost.sh @@ -0,0 +1,693 @@ +#!/bin/bash +# Wrapper script around calibrator or oprofile, used to calculate the number +# of cycles it takes to handle a tlb miss. calibrator will need to be +# downloaded seperately to be used here, otherwise oprofile will be used. +# oprofile does not generate accurate results on x86 or x86_64. 
+# +# Both methods were lifted from a paper by Mel Gorman <mel@xxxxxxxxx> +# +# Licensed under LGPL 2.1 as packaged with libhugetlbfs +# (c) Eric B Munson 2009 +# (c) Mel Gorman 2009 + +# calibrator can be found here: +# http://homepages.cwi.nl/~manegold/Calibrator/v0.9e/calibrator.c +# and should be compiled with this command line: +# gcc calibrator.c -lm -o calibrator +# and then placed in the same directory as this script +# Note: Do not use any optimisation to avoid skewing the results + +# trace == 3 +# info == 2 (default, should remain quiet in practicet) +# error == 1 +VERBOSE=2 +MHZ=0 + +cpumhz() { + MAX_MHZ=0 + SYSFS_SCALING=/sys/devices/system/cpu/cpu0/cpufreq/scaling_available_frequencies + + # Use sysfs if available + if [ -e $SYSFS_SCALING ]; then + for CURR_MHZ in `cat $SYSFS_SCALING`; do + CURR_MHZ=$(($CURR_MHZ/1000)) + if [ $CURR_MHZ -gt $MAX_MHZ ]; then + MAX_MHZ=$CURR_MHZ + fi + done + MHZ=$MAX_MHZ + return + fi + + # Otherwise, use /proc/cpuinfo. Guess what field name is needed. + # In most cases, it's cpu MHz but there will be exceptions + FNAME="cpu MHz" + FINDEX=4 + case "`uname -m`" in + ppc64) + FNAME="clock" + FINDEX=3 + ;; + esac + + # Take a hundred samples in case of CPU frequency scaling artifically + # returning a low value. 
The multiple samples should wake up the CPU + for SAMPLE in `seq 1 100`; do + for CURR_MHZ in `grep "$FNAME" /proc/cpuinfo | awk "{print \\\$$FINDEX}"`; do + CURR_MHZ=${CURR_MHZ/.*} + if [ "$CURR_MHZ" = "" ]; then + echo ERROR: Unable to extract CPU speed from /proc + exit -1 + fi + + if [ $CURR_MHZ -gt $MAX_MHZ ]; then + MAX_MHZ=$CURR_MHZ + fi + done + done + + MHZ=$MAX_MHZ + return +} + +# Print help message +usage() { + echo "tlbmiss_cost.sh [options] +options: + --fetch-calibrator Download and build calibrator if not in path + --fetch-stream Download and build STREAM if not in path + -c, --calibrator Path to calibrator helper if not in path + -s, --stream Path to STREAM helper if not in path + -q, --quiet Be less verbose in output + -v, --verbose Be more verbose in output + -h, --help Print this help message" + exit 1 +} +# Print verbose message to stderr if --verbose is specified +print_trace() +{ + if [ $VERBOSE -ge 3 ]; then + echo "TRACE: $@" 1>&2 + fi +} + +print_error() +{ + if [ $VERBOSE -ge 1 ]; then + echo "ERROR: $@" 1>&2 + fi +} + +die() +{ + print_error $@ + exit -1 +} + +calibrator_fetch() +{ + if [ "`which calibrator`" != "" -o -e ./calibrator ]; then + echo Calibrator is already in path or in current directory + return + fi + + TMPFILE=`mktemp`.c + if [ "$TMPFILE" = "" ]; then + die Failed to create tmpfile + fi + trap "rm $TMPFILE; exit" INT + + WGET=`which wget 2> /dev/null` + if [ "$WGET" = "" ]; then + rm $TMPFILE + die wget is not installed, cannot fetch calibrator.c + fi + + wget http://homepages.cwi.nl/~manegold/Calibrator/v0.9e/calibrator.c -O $TMPFILE || die Failed to download calibrator.c + + # Calibrator defines a function round() which sometimes collides with + # a system-defined version. 
This patch removes the naming collision + PATCHFILE=`basename $TMPFILE` + echo "--- $PATCHFILE.orig 2010-02-02 14:34:38.000000000 +0000 ++++ $PATCHFILE 2010-02-02 14:35:27.000000000 +0000 +@@ -128,7 +128,7 @@ + exit(1); + } + +-lng round(dbl x) ++lng calibrator_round(dbl x) + { + return (lng)(x + 0.5); + } +@@ -890,16 +890,16 @@ + fprintf(fp, \")\n\"); + fprintf(fp, \"set y2tics\"); + for (l = 0, s = \" (\"; l <= cache->levels; l++, s = \", \") { +- if (!delay) fprintf(fp, \"%s'(%ld)' %f\", s, round(CYperIt(cache->latency1[l] - delay)), NSperIt(cache->latency1[l] - delay)); +- else fprintf(fp, \"%s'(%ld)' %f\", s, round(CYperIt(cache->latency2[l] - delay)), NSperIt(cache->latency2[l] - delay)); ++ if (!delay) fprintf(fp, \"%s'(%ld)' %f\", s, calibrator_round(CYperIt(cache->latency1[l] - delay)), NSperIt(cache->latency1[l] - delay)); ++ else fprintf(fp, \"%s'(%ld)' %f\", s, calibrator_round(CYperIt(cache->latency2[l] - delay)), NSperIt(cache->latency2[l] - delay)); + } + for (y = 1; y <= yh; y *= 10) { + fprintf(fp, \"%s'%1.3g' %ld\", s, (dbl)(y * MHz) / 1000.0, y); + } + fprintf(fp, \")\n\"); + for (l = 0; l <= cache->levels; l++) { +- if (!delay) z = (dbl)round(CYperIt(cache->latency1[l] - delay)) * 1000.0 / (dbl)MHz; +- else z = (dbl)round(CYperIt(cache->latency2[l] - delay)) * 1000.0 / (dbl)MHz; ++ if (!delay) z = (dbl)calibrator_round(CYperIt(cache->latency1[l] - delay)) * 1000.0 / (dbl)MHz; ++ else z = (dbl)calibrator_round(CYperIt(cache->latency2[l] - delay)) * 1000.0 / (dbl)MHz; + fprintf(fp, \"set label %ld '(%1.3g) ' at %f,%f right\n\", l + 1, z, xl, z); + fprintf(fp, \"set arrow %ld from %f,%f to %f,%f nohead lt 0\n\", l + 1, xl, z, xh, z); + } +@@ -986,16 +986,16 @@ + fprintf(fp, \"%s'<L1>' %ld)\n\", s, TLB->mincachelines); + fprintf(fp, \"set y2tics\"); + for (l = 0, s = \" (\"; l <= TLB->levels; l++, s = \", \") { +- if (!delay) fprintf(fp, \"%s'(%ld)' %f\", s, round(CYperIt(TLB->latency1[l] - delay)), NSperIt(TLB->latency1[l] - delay)); +- else 
fprintf(fp, \"%s'(%ld)' %f\", s, round(CYperIt(TLB->latency2[l] - delay)), NSperIt(TLB->latency2[l] - delay)); ++ if (!delay) fprintf(fp, \"%s'(%ld)' %f\", s, calibrator_round(CYperIt(TLB->latency1[l] - delay)), NSperIt(TLB->latency1[l] - delay)); ++ else fprintf(fp, \"%s'(%ld)' %f\", s, calibrator_round(CYperIt(TLB->latency2[l] - delay)), NSperIt(TLB->latency2[l] - delay)); + } + for (y = 1; y <= yh; y *= 10) { + fprintf(fp, \"%s'%1.3g' %ld\", s, (dbl)(y * MHz) / 1000.0, y); + } + fprintf(fp, \")\n\"); + for (l = 0; l <= TLB->levels; l++) { +- if (!delay) z = (dbl)round(CYperIt(TLB->latency1[l] - delay)) * 1000.0 / (dbl)MHz; +- else z = (dbl)round(CYperIt(TLB->latency2[l] - delay)) * 1000.0 / (dbl)MHz; ++ if (!delay) z = (dbl)calibrator_round(CYperIt(TLB->latency1[l] - delay)) * 1000.0 / (dbl)MHz; ++ else z = (dbl)calibrator_round(CYperIt(TLB->latency2[l] - delay)) * 1000.0 / (dbl)MHz; + fprintf(fp, \"set label %ld '(%1.3g) ' at %f,%f right\n\", l + 1, z, xl, z); + fprintf(fp, \"set arrow %ld from %f,%f to %f,%f nohead lt 0\n\", l + 1, xl, z, xh, z); + } +@@ -1023,9 +1023,9 @@ + FILE *fp = stdout; + + fprintf(fp, \"CPU loop + L1 access: \"); +- fprintf(fp, \" %6.2f ns = %3ld cy\n\", NSperIt(cache->latency1[0]), round(CYperIt(cache->latency1[0]))); ++ fprintf(fp, \" %6.2f ns = %3ld cy\n\", NSperIt(cache->latency1[0]), calibrator_round(CYperIt(cache->latency1[0]))); + fprintf(fp, \" ( delay: \"); +- fprintf(fp, \" %6.2f ns = %3ld cy )\n\", NSperIt(delay), round(CYperIt(delay))); ++ fprintf(fp, \" %6.2f ns = %3ld cy )\n\", NSperIt(delay), calibrator_round(CYperIt(delay))); + fprintf(fp, \"\n\"); + fflush(fp); + } +@@ -1047,8 +1047,8 @@ + fprintf(fp, \" %3ld KB \", cache->size[l] / 1024); + } + fprintf(fp, \" %3ld bytes \", cache->linesize[l + 1]); +- fprintf(fp, \" %6.2f ns = %3ld cy \" , NSperIt(cache->latency2[l + 1] - cache->latency2[l]), round(CYperIt(cache->latency2[l + 1] - cache->latency2[l]))); +- fprintf(fp, \" %6.2f ns = %3ld cy\n\", 
NSperIt(cache->latency1[l + 1] - cache->latency1[l]), round(CYperIt(cache->latency1[l + 1] - cache->latency1[l]))); ++ fprintf(fp, \" %6.2f ns = %3ld cy \" , NSperIt(cache->latency2[l + 1] - cache->latency2[l]), calibrator_round(CYperIt(cache->latency2[l + 1] - cache->latency2[l]))); ++ fprintf(fp, \" %6.2f ns = %3ld cy\n\", NSperIt(cache->latency1[l + 1] - cache->latency1[l]), calibrator_round(CYperIt(cache->latency1[l + 1] - cache->latency1[l]))); + } + fprintf(fp, \"\n\"); + fflush(fp); +@@ -1075,9 +1075,9 @@ + } else { + fprintf(fp, \" %3ld KB \", TLB->pagesize[l + 1] / 1024); + } +- fprintf(fp, \" %6.2f ns = %3ld cy \", NSperIt(TLB->latency2[l + 1] - TLB->latency2[l]), round(CYperIt(TLB->latency2[l + 1] - TLB->latency2[l]))); ++ fprintf(fp, \" %6.2f ns = %3ld cy \", NSperIt(TLB->latency2[l + 1] - TLB->latency2[l]), calibrator_round(CYperIt(TLB->latency2[l + 1] - TLB->latency2[l]))); + /* +- fprintf(fp, \" %6.2f ns = %3ld cy\" , NSperIt(TLB->latency1[l + 1] - TLB->latency1[l]), round(CYperIt(TLB->latency1[l + 1] - TLB->latency1[l]))); ++ fprintf(fp, \" %6.2f ns = %3ld cy\" , NSperIt(TLB->latency1[l + 1] - TLB->latency1[l]), calibrator_round(CYperIt(TLB->latency1[l + 1] - TLB->latency1[l]))); + */ + fprintf(fp, \"\n\"); + } +" | patch -d /tmp + + LICENSE_END=`grep -n "^ \*/" $TMPFILE | head -1 | cut -f1 -d:` + echo Displaying calibrator license + head -$LICENSE_END $TMPFILE + echo + echo Calibrator is an external tool used by tlbmiss_cost.sh. The license + echo for this software is displayed above. Are you willing to accept the + echo -n "terms of this license [Y/N]? " + read INPUT + + if [ "$INPUT" != "Y" -a "$INPUT" != "y" ]; then + rm $TMPFILE + echo Bailing... + return + fi + echo Building... + gcc $TMPFILE -w -lm -o calibrator || die Failed to compile calibrator + echo Calibrator available at ./calibrator. 
For future use, run tlbmiss_cost.sh + echo from current directory or copy calibrator into your PATH + echo + + rm $TMPFILE +} + +calibrator_calc() +{ + if [ "$CALIBRATOR" = "" ]; then + CALIBRATOR=`which calibrator 2>/dev/null` + if [ "$CALIBRATOR" = "" ]; then + CALIBRATOR="./calibrator" + fi + fi + + if [[ ! -x $CALIBRATOR ]]; then + die "Unable to locate calibrator. Consider using --fetch-calibrator." + fi + + cpumhz + SIZE=$((13*1048576)) + STRIDE=3932160 + PREFIX=tlbmiss-cost-results + TMPFILE=`mktemp` + TOLERANCE=2 + MATCH_REQUIREMENT=3 + MEASURED=0 + FAILED_MEASURE=0 + + if [ "$TMPFILE" = "" ]; then + die Failed to create tmpfile + fi + if [ "$MHZ" = "" ]; then + die Failed to calculate CPU MHz + fi + trap "rm $TMPFILE*; exit" INT + + MATCHED=0 + LAST_LATENCY_CYCLES=-1 + + print_trace Beginning TLB measurement using calibrator + print_trace Measured CPU Speed: $MHZ MHz + print_trace Starting Working Set Size \(WSS\): $SIZE bytes + print_trace Required tolerance for match: $MATCH_REQUIREMENT cycles + + # Keep increasing size until TLB latency is being measured consistently + while [ $MATCHED -lt $MATCH_REQUIREMENT ]; do + $CALIBRATOR $MHZ $SIZE $PREFIX > $TMPFILE 2>&1 + if [ $? 
!= 0 ]; then + SIZE=$(($SIZE*2)) + continue + fi + + LATENCY_CYCLES=`grep ^TLBs: -A 2 $TMPFILE | tail -1 | awk -F = '{print $2}'` + LATENCY_CYCLES=`echo $LATENCY_CYCLES | awk '{print $1}'` + + if [ "$LATENCY_CYCLES" = "" ]; then + FAILED_MEASURE=$(($FAILED_MEASURE+1)) + if [ $MEASURED -eq 0 ]; then + SIZE=$(($SIZE*3/2)) + FAILED_MEASURE=0 + else + if [ $FAILED_MEASURE -eq 3 ]; then + SIZE=$(($SIZE+$STRIDE)) + FAILED_MEASURE=0 + print_trace No TLB Latency measured: New WSS $SIZE + else + print_trace No TLB Latency measured: Retrying + fi + fi + continue + fi + LOW_TOLERANCE=$(($LATENCY_CYCLES-$TOLERANCE)) + HIGH_TOLERANCE=$(($LATENCY_CYCLES+$TOLERANCE)) + if [ $LAST_LATENCY_CYCLES -ge $LOW_TOLERANCE -a \ + $LAST_LATENCY_CYCLES -le $HIGH_TOLERANCE ]; then + MATCHED=$(($MATCHED+1)) + print_trace Measured TLB Latency $LATENCY_CYCLES cycles within tolerance. Matched $MATCHED/$MATCH_REQUIREMENT + else + if [ $LAST_LATENCY_CYCLES -ne -1 ]; then + print_trace Measured TLB Latency $LATENCY_CYCLES cycles outside tolerance + fi + MATCHED=0 + fi + + LAST_LATENCY_CYCLES=$LATENCY_CYCLES + SIZE=$(($SIZE+$STRIDE)) + MEASURED=$(($MEASURED+1)) + FAILED_MEASURE=0 + done + rm $TMPFILE* + rm tlbmiss-cost-results* +} + +# This method uses the stream memory benchmark which can be found here: +# http://www.cs.virginia.edu/stream/FTP/Code/stream.c +# and should be compiled with this command line: +# gcc -m32 -O3 -DN=44739240 stream.c -o STREAM +# and then placed in the same directory as this script + +stream_fetch() +{ + # STREAM binary is in caps as there is a commonly-available binary + # called stream that is packaged with ImageMagick. 
This avoids some + # confusion + if [ "`which STREAM`" != "" -o -e ./STREAM ]; then + echo STREAM is already in path or in current directory + return + fi + + TMPFILE=`mktemp`.c + if [ "$TMPFILE" = "" ]; then + die Failed to create tmpfile + fi + trap "rm $TMPFILE; exit" INT + + WGET=`which wget 2> /dev/null` + if [ "$WGET" = "" ]; then + rm $TMPFILE + die wget is not installed, cannot fetch stream.c + fi + + wget http://www.cs.virginia.edu/stream/FTP/Code/stream.c -O $TMPFILE || die Failed to download stream.c + + LICENSE_END=`grep -n "^/\*--" $TMPFILE | tail -1 | cut -f1 -d:` + echo Displaying STREAM license + head -$LICENSE_END $TMPFILE + echo + echo STREAM is an external tool used by tlbmiss_cost.sh. The license + echo for this software is displayed above. Are you willing to accept the + echo -n "terms of this license [Y/N]? " + read INPUT + + if [ "$INPUT" != "Y" -a "$INPUT" != "y" ]; then + rm $TMPFILE + echo Bailing... + return + fi + echo Building... + gcc -m32 -O3 -w -DN=44739240 $TMPFILE -o STREAM || die Failed to compile STREAM + echo STREAM is available at ./STREAM. 
For future use, run tlbmiss_cost.sh + echo from current directory or copy STREAM into your PATH + echo + + rm $TMPFILE +} + +seperate_dtlb_pagewalk_groups() +{ + TIMER_DTLB_EVENT=`oprofile_map_events.pl --event timer | cut -d: -f1 2> /dev/null` + TIMER_WALK_EVENT=`oprofile_map_events.pl --event timer30 | cut -d: -f1 2> /dev/null` + + # Help debug problems launching oprofile + print_trace oprofile launch commands as follows + print_trace dtlb misses :: oprofile_start --event timer --event dtlb_miss --sample-cycle-factor 5 + print_trace tablewalk cycles :: oprofile_start --event timer30 --event tablewalk_cycles --sample-cycle-factor 5 --sample-event-factor $SAMPLE_EVENT_FACTOR + + print_trace Rerunning benchmark to measure number of DTLB misses + $OPST $VMLINUX --event timer --event dtlb_miss --sample-cycle-factor 5 >/dev/null 2>&1 || \ + die "Error starting oprofile, check oprofile_map_event.pl for appropriate timer and dtlb_miss events." + $STREAM >/dev/null 2>&1 + + opcontrol --stop >/dev/null 2>&1 + opcontrol --dump >/dev/null 2>&1 + + # First ensure that the location of event counters are where we + # expect them to be. 
The expectation is that the timer30 is in + # the first column and the tablewalk_cycles is in the third + SAMPLES_START=`opreport | grep -n "samples|" | head -1 | cut -d: -f1` + if [ "$SAMPLES_START" = "" ]; then + die Could not establish start of samples from opreport + SAMPLES_START=$(($COUNT_START+1)) + fi + INDEX=`opreport | head -$SAMPLES_START | grep "^Counted .* events" | grep -n $TIMER_DTLB_EVENT | cut -d: -f1` + TIMER_DTLB_FIELD=$((1+2*($INDEX - 1))) + if [ $TIMER_DTLB_FIELD -eq 1 ]; then + DTLB_TIMER_INDEX=1 + else + DTLB_TIMER_INDEX=2 + fi + + TIMER_DTLB_SCALE=`opreport | grep "$TIMER_DTLB_EVENT" | head -1 | sed 's/.* count \([0-9]*\).*/\1/'` + DTLB_SCALE=`opreport | grep "$DTLB_EVENT" | head -1 | sed 's/.* count \([0-9]*\).*/\1/'` + + RESULTS=`opreport | grep " STREAM" | head -1` + FIELD1=`echo "$RESULTS" | sed 's/[[:space:]]*\([0-9]*\).*/\1/'` + FIELD2=`echo "$RESULTS" | sed 's/[[:space:]]*[0-9]*[[:space:]]*[[:graph:]]*[[:space:]]*\([0-9]*\).*/\1/'` + RESULTS=`opreport | grep " vmlinux" | head -1` + KERNEL_FIELD1=`echo "$RESULTS" | sed 's/[[:space:]]*\([0-9]*\).*/\1/'` + KERNEL_FIELD2=`echo "$RESULTS" | sed 's/[[:space:]]*[0-9]*[[:space:]]*[[:graph:]]*[[:space:]]*\([0-9]*\).*/\1/'` + + if [ $DTLB_TIMER_INDEX -eq 1 ] ; then + TIMER_DTLB=$(($FIELD1+$KERNEL_FIELD1)) + DTLB=$(($FIELD2+$KERNEL_FIELD2)) + else + TIMER_DTLB=$(($FIELD2+$KERNEL_FIELD2)) + DTLB=$(($FIELD1+$KERNEL_FIELD1)) + fi + + print_trace Shutting down oprofile + opcontrol --shutdown >/dev/null 2>&1 + opcontrol --deinit >/dev/null 2>&1 + + # Next STREAM needs to be run measuring the tablewalk_cycles. Because + # of differences in the frequency CPU events occur, there are + # alterations in the timing. To make an accurate comparison, the + # cycle counts of the two profiles need to be very similar. 
oprofile + # does not give much help here in matching up different reports taking + # different readings so there is nothing really to do but run STREAM + # multiple times, scaling the events at different rates until a + # reasonably close match is found. + + # The cycle counts for two oprofiles must be within 10% of each other + TOLERANCE=$(($TIMER_DTLB*4/100)) + SAMPLE_EVENT_FACTOR=1 + LOW_TIMER_WALK=0 + HIGH_TIMER_WALK=0 + + print_trace Running benchmark to measure table walk cycles + while [ $TIMER_DTLB -ge $LOW_TIMER_WALK -a $TIMER_DTLB -ge $HIGH_TIMER_WALK ]; do + + if [ $LOW_TIMER_WALK -ne 0 ]; then + print_trace High diff with scaling x$LAST_SAMPLE_EVENT_FACTOR. Required $TIMER_DTLB +/ $TOLERANCE, got $TIMER_WALK + fi + + $OPST $VMLINUX --event timer30 --event tablewalk_cycles --sample-cycle-factor 5 --sample-event-factor $SAMPLE_EVENT_FACTOR >/dev/null 2>&1 || \ + die "Error starting oprofile, check oprofile_map_event.pl for appropriate timer30 and tablewalk_cycles events." + $STREAM >/dev/null 2>&1 + + opcontrol --stop >/dev/null 2>&1 + opcontrol --dump >/dev/null 2>&1 + + # Extract the event counts + TIMER_WALK_SCALE=`opreport | grep "$TIMER_WALK_EVENT" | head -1 | sed 's/.* count \([0-9]*\).*/\1/'` + WALK_SCALE=`opreport | grep "$WALK_EVENT" | head -1 | sed 's/.* count \([0-9]*\).*/\1/'` + + # This shouldn't happen. One would expect that the minimum sample + # rate for any of the timers in any groups is the same. If they + # differ, it might be a simple bug in oprofile_map_event that + # needs fixing. In the event this bug is reported, get the CPU + # type and the output of opcontrol --list-events + if [ $TIMER_DTLB_SCALE -ne $TIMER_WALK_SCALE ]; then + die Cycle CPUs were sampled at different rates. 
+ fi + + RESULTS=`opreport | grep " STREAM" | head -1` + FIELD1=`echo "$RESULTS" | sed 's/[[:space:]]*\([0-9]*\).*/\1/'` + FIELD2=`echo "$RESULTS" | sed 's/[[:space:]]*[0-9]*[[:space:]]*[[:graph:]]*[[:space:]]*\([0-9]*\).*/\1/'` + RESULTS=`opreport | grep " vmlinux" | head -1` + KERNEL_FIELD1=`echo "$RESULTS" | sed 's/[[:space:]]*\([0-9]*\).*/\1/'` + KERNEL_FIELD2=`echo "$RESULTS" | sed 's/[[:space:]]*[0-9]*[[:space:]]*[[:graph:]]*[[:space:]]*\([0-9]*\).*/\1/'` + + if [ $DTLB_TIMER_INDEX -eq 1 ] ; then + TIMER_WALK=$(($FIELD1+$KERNEL_FIELD1)) + WALK=$(($FIELD2+$KERNEL_FIELD2)) + else + TIMER_WALK=$(($FIELD2+$KERNEL_FIELD2)) + WALK=$(($FIELD1+$KERNEL_FIELD1)) + fi + + LOW_TIMER_WALK=$(($TIMER_WALK-$TOLERANCE)) + HIGH_TIMER_WALK=$(($TIMER_WALK+$TOLERANCE)) + + # Scale faster if the difference between timers is huge + LAST_SAMPLE_EVENT_FACTOR=$SAMPLE_EVENT_FACTOR + if [ $(($TIMER_DTLB*3/4-$HIGH_TIMER_WALK)) -gt 0 ]; then + SAMPLE_EVENT_FACTOR=$(($SAMPLE_EVENT_FACTOR+3)) + elif [ $(($TIMER_DTLB*9/10-$HIGH_TIMER_WALK)) -gt 0 ]; then + SAMPLE_EVENT_FACTOR=$(($SAMPLE_EVENT_FACTOR+2)) + else + SAMPLE_EVENT_FACTOR=$(($SAMPLE_EVENT_FACTOR+1)) + fi + + opcontrol --shutdown >/dev/null 2>&1 + opcontrol --deinit >/dev/null 2>&1 + done + + print_trace "DTLB Scale: $DTLB_SCALE" + print_trace "Walk Scale: $WALK_SCALE" + print_trace "DTLB events: $DTLB" + print_trace "Walk events: $WALK" + print_trace "Cycle DTLB Scale: $TIMER_DTLB_SCALE" + print_trace "Cycle Walk Scale: $TIMER_WALK_SCALE" + print_trace "Cycle DTLB events: $TIMER_DTLB" + print_trace "Cycle Walk events: $TIMER_WALK" +} + +dtlb_pagewalk_same_group() +{ + print_trace oprofile launch command as follows + print_trace $OPST --event dtlb_miss --event tablewalk_cycles + + $OPST $VMLINUX --event dtlb_miss --event tablewalk_cycles > /dev/null 2>&1 || \ + die "Error starting oprofile, check oprofile_map_event.pl for appropriate dtlb_miss and tablewalk_cycles events." 
+ $STREAM >/dev/null 2>&1 + + opcontrol --stop >/dev/null 2>&1 + opcontrol --dump >/dev/null 2>&1 + + # First ensure that the location of event counters are where we + # expect them to be. The expectation is that tablewalk_cycles is in + # the first column and the dtlb_misses is in the second + SAMPLES_START=`opreport | grep -n "samples|" | head -1 | cut -d: -f1` + if [ "$SAMPLES_START" = "" ]; then + die Could not establish start of samples from opreport + fi + INDEX=`opreport | head -$SAMPLES_START | grep "^Counted .* events" | grep -n $WALK_EVENT | cut -d: -f1` + WALK_FIELD=$((1+2*($INDEX - 1))) + if [ $WALK_FIELD -ne 1 ]; then + die Table walk events are not in the expected column, parse failure + fi + INDEX=`opreport | head -$SAMPLES_START | grep "^Counted .* events" | grep -n $DTLB_EVENT | cut -d: -f1` + DTLB_FIELD=$((1+2*($INDEX - 1))) + if [ $DTLB_FIELD -ne 3 ]; then + die DTLB miss events are not in the expected column, parse failure + fi + + # Columns look ok, extract the event counts + DTLB_SCALE=`opreport | grep "$DTLB_EVENT" | head -1 | sed 's/.* count \([0-9]*\).*/\1/'` + WALK_SCALE=`opreport | grep "$WALK_EVENT" | head -1 | sed 's/.* count \([0-9]*\).*/\1/'` + RESULTS=`opreport | grep " STREAM" | head -1` + WALK=`echo "$RESULTS" | sed 's/[[:space:]]*\([0-9]*\).*/\1/'` + DTLB=`echo "$RESULTS" | sed 's/[[:space:]]*[0-9]*[[:space:]]*[[:graph:]]*[[:space:]]*\([0-9]*\).*/\1/'` + RESULTS=`opreport | grep " vmlinux" | head -1` + KERN_TABLE_WALK=`echo "$RESULTS" | sed 's/[[:space:]]*\([0-9]*\).*/\1/'` + KERN_TLB_MISS=`echo "$RESULTS" | sed 's/[[:space:]]*[0-9]*[[:space:]]*[[:graph:]]*[[:space:]]*\([0-9]*\).*/\1/'` + + print_trace "DTLB Scale: $DTLB_SCALE" + print_trace "Walk Scale: $WALK_SCALE" + print_trace "DTLB events: $DTLB + $KERN_TLB_MISS = $(($DTLB+$KERN_TLB_MISS))" + print_trace "Walk events: $WALK + $KERN_TABLE_WALK = $(($WALK+$KERN_TABLE_WALK))" + + if [[ "$KERN_TLB_MISS" != "" ]]; then + DTLB=$(($DTLB+$KERN_TLB_MISS)) + fi + if [[ 
"$KERN_TABLE_WALK" != "" ]]; then + WALK=$(($WALK+$KERN_TABLE_WALK)) + fi + + opcontrol --shutdown >/dev/null 2>&1 + opcontrol --deinit >/dev/null 2>&1 +} + +oprofile_calc() +{ + if [ "$STREAM" = "" ]; then + STREAM="./STREAM" + fi + + if [[ ! -x $STREAM ]]; then + die "Unable to locate STREAM. Consider using --fetch-stream." + fi + + OPST=`which oprofile_start.sh` + if [ "$OPST" = "" ]; then + OPST="../oprofile_start.sh" + fi + + if [[ ! -x $OPST ]]; then + die "Unable to locate oprofile_start.sh." + fi + + print_trace Forcing shutdown of oprofile + opcontrol --shutdown >/dev/null 2>&1 + opcontrol --deinit >/dev/null 2>&1 + + print_trace Gathering the name of CPU events + WALK_EVENT=`oprofile_map_events.pl --event tablewalk_cycles | cut -d: -f1 2> /dev/null` + DTLB_EVENT=`oprofile_map_events.pl --event dtlb_miss | cut -d: -f1 2> /dev/null` + + GROUP1=`echo $WALK_EVENT | sed 's/.*\(GRP[0-9]*\)/\1/'` + GROUP2=`echo $DTLB_EVENT | sed 's/.*\(GRP[0-9]*\)/\1/'` + + print_trace Warming the benchmark to avoid page faults of the binary + $STREAM >/dev/null 2>&1 + + if [[ "$GROUP1" == "$GROUP2" ]] ; then + print_trace "Events are in the same group: $GROUP1, using one oprofile pass" + dtlb_pagewalk_same_group + else + print_trace "Events are in different groups: $GROUP1 and $GROUP2, using multiple oprofile passes" + seperate_dtlb_pagewalk_groups + fi + + WALK=$(($WALK*$WALK_SCALE)) + DTLB=$(($DTLB*$DTLB_SCALE)) + LAST_LATENCY_CYCLES=$(($WALK/$DTLB)) +} + +ARGS=`getopt -o c:s:fvqh --long calibrator:,stream:,vmlinux:,verbose,quiet,fetch-calibrator,fetch-stream,ignore-cache,help -n 'tlbmiss_cost.sh' -- "$@"` + +eval set -- "$ARGS" + +while true ; do + case "$1" in + -c|--calibrator) CALIBRATOR="$2" ; shift 2 ;; + -s|--stream) STREAM="$2" ; shift 2 ;; + --vmlinux) VMLINUX="--vmlinux $2" ; shift 2 ;; + -v|--verbose) VERBOSE=$(($VERBOSE+1)); shift;; + -q|--quiet) VERBOSE=$(($VERBOSE-1)); shift;; + -f|--ignore-cache) IGNORE_CACHE=yes; shift;; + --fetch-calibrator) calibrator_fetch; 
shift;; + --fetch-stream) stream_fetch; shift;; + -h|--help) usage; shift;; + "") shift ; break ;; + "--") shift ; break ;; + *) die "Unrecognized option $1" ;; + esac +done + +HOSTNAME=`hostname 2> /dev/null` +ARCH=`uname -m | sed -e s/i.86/i386/` + +if [ "$IGNORE_CACHE" != "yes" ]; then + print_trace Searching for a cached value for TLB miss + + # Look for a cached entry for the TLB miss value + if [ -e /etc/tlbmisscost.conf ]; then + print_trace Checking /etc/tlbmisscost.conf + grep TLB_MISS_COST /etc/tlbmisscost.conf + if [ $? -eq 0 ]; then + exit 0 + fi + fi + + # Look for a cached entry in home + if [ -e $HOME/.tlbmisscostrc ]; then + print_trace Checking $HOME/.tlbmisscostrc + HOSTNAME=`hostname 2> /dev/null` + if [ "$HOSTNAME" != "" -a "$HOSTNAME" != "localhost" ]; then + grep $HOSTNAME:TLB_MISS_COST $HOME/.tlbmisscostrc | sed -e "s/^$HOSTNAME://" + if [ $? -eq 0 ]; then + exit 0 + fi + fi + fi + print_trace Cached value unavailable +fi + +if [[ "$ARCH" == "ppc64" || "$ARCH" == "ppc" ]]; then + oprofile_calc +else + calibrator_calc +fi + +echo TLB_MISS_COST=$LAST_LATENCY_CYCLES + +# Save for future reference +echo TLB_MISS_COST=$LAST_LATENCY_CYCLES 2> /dev/null > /etc/tlbmisscost.conf +if [ "$HOSTNAME" != "" -a "$HOSTNAME" != "localhost" ]; then + grep -v $HOSTNAME:TLB_MISS_COST $HOME/.tlbmisscostrc > $HOME/.tlbmisscostrc.$$ 2> /dev/null + echo $HOSTNAME:TLB_MISS_COST=$LAST_LATENCY_CYCLES >> $HOME/.tlbmisscostrc.$$ + mv $HOME/.tlbmisscostrc.$$ $HOME/.tlbmisscostrc +fi + +exit 0 diff --git a/default/libhugetlbfs/libhugetlbfs/cpupcstat b/default/libhugetlbfs/libhugetlbfs/cpupcstat new file mode 100755 index 0000000..e6b44c3 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/cpupcstat @@ -0,0 +1,337 @@ +#!/usr/bin/perl -w +# This script starts a requested application after setting up oprofile to +# collect TLB miss data. It will use this data to calculate the TLB +# apporximate TLB miss rate. 
+# Licensed under LGPL 2.1 as packaged with libhugetlbfs +# (c) Eric Munson 2009 + +use Getopt::Long; +use FindBin qw($Bin); +use lib "$Bin"; +use POSIX ":sys_wait_h"; +use TLBC::OpCollect; +use TLBC::PerfCollect; +use strict; + +my ($arch, $cputype); +my $vmlinux; +my $target; +my $real_target; +my $target_pid; +my $target_global; +my $misses; +my $instructions = 0; +my $cycles = 0; +my $kern_misses; +my $time_elapsed; +my $wait_time = 10; +my $time_limit; +my $persist = 0; +my $instruct_ratio; +my $cycle_ratio; +my $service; +my $config; +my $cost_in_cycles = 0; +my $kernel; +my $force_oprofile; +my $collector; +my $miss_scale = 0; +my $ins_scale = 0; +my $cyc_scale = 0; + +sub calc_tlbmiss_cost() +{ + my $cost_script = `which tlbmiss_cost.sh`; + if ($cost_script eq "") { + $cost_script = "$Bin/contrib/tlbmiss_cost.sh"; + } + my $data = `$cost_script --vmlinux $vmlinux`; + ($data,$cost_in_cycles) = split(/\=/, $data); + chomp($cost_in_cycles); +} + +sub start_target() +{ + my $pid = fork(); + if (not defined $pid) { + die "Failed to fork\n"; + } elsif ($pid == 0) { + exec $target or die "Failed to exec '$target'\n"; + } else { + return($pid); + } +} + +sub run_profile() +{ + my $start_time; + my $end_time; + my @results; + my $binName; + my $pid; + my $ret; + my $prev = 0; + my $kern_prev = 0; + my $ins_new = 0; + my $ins_prev = 0; + my $cyc_new = 0; + my $cyc_prev = 0; + my $new; + my @events; + + if ($force_oprofile) { + $collector = TLBC::OpCollect->new(); + } else { + $collector = TLBC::PerfCollect->new(); + } + + push(@events, "dtlb_miss"); + if ($instruct_ratio) { + push(@events, "instructions"); + } + if ($cycle_ratio || $service) { + push(@events, "timer"); + } + + $start_time = time(); + + if ($collector->setup($vmlinux, \@events) == 0) { + $collector = TLBC::OpCollect->new(); + if ($force_oprofile || + $collector->setup($vmlinux, \@events) == 0) { + die("Unable to setup data collector"); + } + } + + if (defined $target_pid) { + $target = 
readlink("/proc/$target_pid/exe"); + chomp($target); + $binName = $target; + $pid = $target_pid; + } elsif (defined $target) { + if (defined $real_target) { + $binName = $real_target; + } else { + @results = split(/ /, $target); + $binName = $results[0]; + } + $pid = start_target(); + } elsif (defined $target_global) { + $binName='/'; + $pid = $$; + } + + $binName = `basename $binName`; + chomp($binName); + + printf("%15s%18s%19s", "Target Name", "DTLB Miss Samples", + "Samples/second"); + + $miss_scale = $collector->samples("dtlb_miss"); + if ($instruct_ratio) { + printf("%24s\n", "Instructions/TLB Miss\n"); + $ins_scale = $collector->samples("instructions"); + } elsif ($cycle_ratio) { + printf("%24s\n", "Cycles/TLB Miss\n"); + $cyc_scale = $collector->samples("timer"); + } elsif ($service) { + printf("%24s\n", "TLB Miss %age Time\n"); + $cyc_scale = $collector->samples("timer"); + } else { + print("\n"); + } + + printf("%15s%18s%19s\n", "", "Sample every " . $collector->samples("dtlb_miss"), ""); + sleep($wait_time); + + # While our target is still running and we have not exceeded our + # runtime, collect oprofile data every $wait_time seconds to display + # the dtlb miss rate. 
+ while (waitpid($pid, WNOHANG) <= 0 || $persist) { + $collector->read_eventcount(); + $ret = $collector->get_current_eventcount($binName, "dtlb_miss"); + $new = $ret - $prev; + printf("%15s%18d%19f", $binName, $new, $new / $wait_time); + $prev = $ret; + + if ($instruct_ratio) { + $ret = $collector->get_current_eventcount($binName, + "instructions"); + $ins_new = $ret - $ins_prev; + if ($new == 0) { + printf("%24f\n", $new); + } else { + printf("%24f\n", + ($ins_new * $ins_scale) / ($new * $miss_scale)); + } + $ins_prev = $ret; + } elsif ($cycle_ratio) { + $ret = $collector->get_current_eventcount($binName, + "timer"); + $cyc_new = $ret - $cyc_prev; + if ($new == 0) { + printf("%24f\n", $new); + } else { + printf("%24f\n", + ($cyc_new * $cyc_scale) / ($new * $miss_scale)); + } + $cyc_prev = $ret; + } elsif ($service) { + + $ret = $collector->get_current_eventcount($binName, + "timer"); + $cyc_new = $ret - $cyc_prev; + my $miss_cycles = $new * $cost_in_cycles * $miss_scale; + my $total_cycles = $cyc_new * $cyc_scale; + + printf "%24.4f%%\n", $miss_cycles * 100/$total_cycles; + + $cyc_prev = $ret; + } else { + print("\n"); + } + if ($kernel) { + $ret = $collector->get_current_eventcount("vmlinux", "dtlb_miss"); + $new = $ret - $kern_prev; + printf("%15s%18d%19f\n", "vmlinux", $new, + $new / $wait_time); + $kern_prev = $ret; + } + $end_time = time(); + $time_elapsed = $end_time - $start_time; + if (defined $time_limit && $time_elapsed > $time_limit) { + last; + } + sleep($wait_time); + } + $end_time = time(); + $time_elapsed = $end_time - $start_time; + $collector->read_eventcount(); + $misses = $collector->get_current_eventcount($binName, "dtlb_miss"); + if ($instruct_ratio) { + $instructions = $collector->get_current_eventcount($binName, "instructions"); + } + if ($cycle_ratio || $service) { + $cycles = $collector->get_current_eventcount($binName, "timer"); + } + + if ($kernel) { + $kern_misses = $collector->get_current_eventcount("vmlinux", "dtlb_miss"); + } + + 
$collector->shutdown(); +} + +sub get_target() +{ + $target .= $_[0] . " "; +} + +sub print_usage() +{ + print "Usage: cpupcstat [options] target + Options: + --vmlinux /path/to/vmlinux Sets the vmlinux file to use + --delay N Waits N seconds before rereading the + miss rate + --target-global Watch the miss rate of all processes + --target-pid P Watch the miss rate of P instead of a target + --real-target T Watch T instead of target in case target is + a launcher script + --time-limit L Sets a time limit for watching the target + --kernel Output DTLB miss data for the kernel as well + as the specified target + --time-servicing Print the percentage of time servicing TLB + misses + --misses-per-instruction Prints the ratio of TLB misses per + instruction retired + --misses-per-cycle Prints the ratio of TLB misses per CPU cycle + --force-oprofile The perf tool is prefered for data + collection with oprofile as the fall back, + force oprofile usage instead + --help prints this message + + Note: If --target-pid is specified, target will be ignored.\n"; + exit(0); +} + +sub exit_cleanup() +{ + my $collector = TLBC::OpCollect->new(); + $collector->shutdown(); + exit(0); +} +use sigtrap 'handler' => \&exit_cleanup, 'INT'; + +Getopt::Long::Configure ('bundling'); +GetOptions ('v|vmlinux=s' => \$vmlinux, + 'h|help' => \&print_usage, + 'd|delay=i' => \$wait_time, + 'g|target-global' => \$target_global, + 'p|target-pid=i' => \$target_pid, + 'r|real-target=s' => \$real_target, + 'l|time-limit=i' => \$time_limit, + 'k|kernel' => \$kernel, + 'i|misses-per-instruction' => \$instruct_ratio, + 'c|misses-per-cycle' => \$cycle_ratio, + 't|time-servicing' => \$service, + 'C|cost-config=s' => \$config, + 'o|force-oprofile' => \$force_oprofile, + 's|persist' => \$persist, + '<>' => \&get_target); + +if (!$target && !$target_global && not defined $target_pid) { + print_usage(); +} + +if (!$vmlinux) { + $vmlinux = "/boot/vmlinux-" . 
`uname -r`; +} + +chomp($vmlinux); +if ($target) { + chomp($target); +} + +if ($service) { + calc_tlbmiss_cost(); +} + +$misses = 0; +$kern_misses = 0; +run_profile(); + +if ($misses > 0) { + print("\n$target saw $misses total DTLB miss samples over ", + "$time_elapsed seconds\n"); + print("at rate of ", $misses / $time_elapsed, " samples/second\n"); + $misses *= $miss_scale; + $cycles *= $cyc_scale; + $instructions *= $ins_scale; + + if ($instruct_ratio && $instructions > 0) { + print("The ratio of instructions retired per TLB miss was ", + $instructions / $misses, "\n"); + } + if ($cycle_ratio && $cycles > 0) { + print("The ratio of cycles per TLB miss was ", + $cycles / $misses, "\n"); + } + + if ($service && $cycles > 0) { + if ($cost_in_cycles <= 0) { + calc_tlbmiss_cost(); + } + my $total_cost = $cost_in_cycles * $misses; + print("$target spent ", + $total_cost / $cycles * 100, + "% of its CPU cycles servicing\nTLB misses\n"); + } +} + +if ($kern_misses > 0) { + print("The kernel saw $kern_misses total DTLB miss samples over ", + "$time_elapsed seconds\n"); + print("at rate of ", $kern_misses / $time_elapsed, " samples/second\n"); +} + diff --git a/default/libhugetlbfs/libhugetlbfs/debug.c b/default/libhugetlbfs/libhugetlbfs/debug.c new file mode 100644 index 0000000..6bc7b76 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/debug.c @@ -0,0 +1,50 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +#include "hugetlbfs.h" + +#include "libhugetlbfs_internal.h" + +int __hugetlbfs_verbose = VERBOSITY_DEFAULT; +bool __hugetlbfs_debug = false; +bool __hugetlbfs_prefault = true; +char __hugetlbfs_hostname[64]; + +static int initialized; + +static void __hugetlbfs_init_debug(void) +{ + if (initialized) + return; + + gethostname(__hugetlbfs_hostname, sizeof(__hugetlbfs_hostname)-1); + + initialized = 1; +} + +void hugetlbfs_setup_debug(void) +{ + __hugetlbfs_init_debug(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/elf32ppclinux.c b/default/libhugetlbfs/libhugetlbfs/elf32ppclinux.c new file mode 100644 index 0000000..24adaf1 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/elf32ppclinux.c @@ -0,0 +1,54 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <elf.h> +#include <link.h> + +#include "libhugetlbfs_internal.h" + +/* + * The powerpc 32-bit ELF ABI defines the location and size of the plt as + * follows (see the ELF ABI and powerpc32 supplement for details): + * + * Location: (data segment p_vaddr) + (data segment p_filesz) + * Size: (dynamic symbol table DT_PTRELSZ entry) + 72 + * + * plt entries have likely been initialized when the libhugetlbfs remapping + * code runs, we must copy these entries when preparing the data segment. Tell + * the arch-independent code how many bytes to copy. + */ +ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab) +{ + int i; + ElfW(Word) pltrelsz = 0; + + /* Find the needed information in the dynamic section */ + for (i = 0; dyntab[i].d_tag != DT_NULL; i++) + if (dyntab[i].d_tag == DT_PLTRELSZ) + pltrelsz = dyntab[i].d_un.d_val; + + /* pltrelsz indicates the size of all plt entries used to cache + * symbol lookups, but does not include the reserved entry at PLT[0]. + * 72 bytes is the ABI-defined size of a plt entry. + */ + if (pltrelsz) + return pltrelsz + 72; + else + return 0; +} diff --git a/default/libhugetlbfs/libhugetlbfs/elf64ppc.c b/default/libhugetlbfs/libhugetlbfs/elf64ppc.c new file mode 100644 index 0000000..8c86fca --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/elf64ppc.c @@ -0,0 +1,54 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <elf.h> +#include <link.h> + +#include "libhugetlbfs_internal.h" + +/* + * The powerpc 64-bit ELF ABI defines the location and size of the plt as + * follows (see the ELF ABI and powerpc64 supplement for details): + * + * Location: (data segment p_vaddr) + (data segment p_filesz) + * Size: (dynamic symbol table DT_PTRELSZ entry) + 24 + * + * plt entries have likely been initialized when the libhugetlbfs remapping + * code runs, we must copy these entries when preparing the data segment. Tell + * the arch-independent code how many bytes to copy. + */ +ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab) +{ + int i; + ElfW(Word) pltrelsz = 0; + + /* Find the needed information in the dynamic section */ + for (i = 0; dyntab[i].d_tag != DT_NULL; i++) + if (dyntab[i].d_tag == DT_PLTRELSZ) + pltrelsz = dyntab[i].d_un.d_val; + + /* pltrelsz indicates the size of all plt entries used to cache + * symbol lookups, but does not include the reserved entry at PLT[0]. + * 24 bytes is the ABI-defined size of a plt entry. + */ + if (pltrelsz) + return pltrelsz + 24; + else + return 0; +} diff --git a/default/libhugetlbfs/libhugetlbfs/elflink.c b/default/libhugetlbfs/libhugetlbfs/elflink.c new file mode 100644 index 0000000..c24bedc --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/elflink.c @@ -0,0 +1,1333 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <link.h> +#include <malloc.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <signal.h> +#include <sys/syscall.h> +#include <sys/file.h> +#include <linux/unistd.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <sys/stat.h> +#include <errno.h> +#include <limits.h> +#include <elf.h> +#include <dlfcn.h> + +#include "version.h" +#include "hugetlbfs.h" +#include "libhugetlbfs_internal.h" + +#ifdef __LP64__ +#define Elf_Ehdr Elf64_Ehdr +#define Elf_Phdr Elf64_Phdr +#define Elf_Dyn Elf64_Dyn +#define Elf_Sym Elf64_Sym +#define ELF_ST_BIND(x) ELF64_ST_BIND(x) +#define ELF_ST_TYPE(x) ELF64_ST_TYPE(x) +#else +#define Elf_Ehdr Elf32_Ehdr +#define Elf_Phdr Elf32_Phdr +#define Elf_Dyn Elf32_Dyn +#define Elf_Sym Elf32_Sym +#define ELF_ST_BIND(x) ELF64_ST_BIND(x) +#define ELF_ST_TYPE(x) ELF64_ST_TYPE(x) +#endif + +/* + * SHARED_TIMEOUT is used by find_or_prepare_shared_file for when it + * should timeout while waiting for other users to finish preparing + * the file it wants. 
The value is the number of tries before giving + * up with a 1 second wait between tries + */ +#define SHARED_TIMEOUT 10 + +/* This function prints an error message to stderr, then aborts. It + * is safe to call, even if the executable segments are presently + * unmapped. + * + * Arguments are printf() like, but at present supports only %d and %p + * with no modifiers + * + * FIXME: This works in practice, but I suspect it + * is not guaranteed safe: the library functions we call could in + * theory call other functions via the PLT which will blow up. */ +static void write_err(const char *start, int len) +{ + direct_syscall(__NR_write, 2 /*stderr*/, start, len); +} +static void sys_abort(void) +{ + pid_t pid = direct_syscall(__NR_getpid); + + direct_syscall(__NR_kill, pid, SIGABRT); +} +static void write_err_base(unsigned long val, int base) +{ + const char digit[] = "0123456789abcdef"; + char str1[sizeof(val)*8]; + char str2[sizeof(val)*8]; + int len = 0; + int i; + + str1[0] = '0'; + while (val) { + str1[len++] = digit[val % base]; + val /= base; + } + + if (len == 0) + len = 1; + + /* Reverse digits */ + for (i = 0; i < len; i++) + str2[i] = str1[len-i-1]; + + write_err(str2, len); +} + +static void unmapped_abort(const char *fmt, ...) +{ + const char *p, *q; + int done = 0; + unsigned long val; + va_list ap; + + /* World's worst printf()... */ + va_start(ap, fmt); + p = q = fmt; + while (! 
done) { + switch (*p) { + case '\0': + write_err(q, p-q); + done = 1; + break; + + case '%': + write_err(q, p-q); + p++; + switch (*p) { + case 'u': + val = va_arg(ap, unsigned); + write_err_base(val, 10); + p++; + break; + case 'p': + val = (unsigned long)va_arg(ap, void *); + write_err_base(val, 16); + p++; + break; + } + q = p; + break; + default: + p++; + } + } + + va_end(ap); + + sys_abort(); +} + +/* The directory to use for sharing readonly segments */ +static char share_readonly_path[PATH_MAX+1]; + +#define MAX_HTLB_SEGS 3 +#define MAX_SEGS 10 + +struct seg_info { + void *vaddr; + unsigned long filesz, memsz, extrasz; + int prot; + int fd; + int index; + long page_size; +}; + +struct seg_layout { + unsigned long start, end; + long page_size; +}; + +static struct seg_info htlb_seg_table[MAX_HTLB_SEGS]; +static int htlb_num_segs; +static unsigned long force_remap; /* =0 */ +static long hpage_readonly_size, hpage_writable_size; + +/** + * assemble_path - handy wrapper around snprintf() for building paths + * @dst: buffer of size PATH_MAX+1 to assemble string into + * @fmt: format string for path + * @...: printf() style parameters for path + * + * assemble_path() builds a path in the target buffer (which must have + * PATH_MAX+1 available bytes), similar to sprintf(). However, f the + * assembled path would exceed PATH_MAX characters in length, + * assemble_path() prints an error and abort()s, so there is no need + * to check the return value and backout. + */ +static void assemble_path(char *dst, const char *fmt, ...) 
+{ + va_list ap; + int len; + + va_start(ap, fmt); + len = vsnprintf(dst, PATH_MAX+1, fmt, ap); + va_end(ap); + + if (len < 0) { + ERROR("vsnprintf() error\n"); + abort(); + } + + if (len > PATH_MAX) { + ERROR("Overflow assembling path\n"); + abort(); + } +} + +static void check_memsz() +{ + int i; + unsigned long memsz_total = 0, memsz_max = 0; + if (htlb_num_segs == 0) + return; + /* + * rough heuristic to see if we'll run out of address + * space + */ + for (i = 0; i < htlb_num_segs; i++) { + memsz_total += htlb_seg_table[i].memsz; + if (htlb_seg_table[i].memsz > memsz_max) + memsz_max = htlb_seg_table[i].memsz; + } + /* avoid overflow checking by using two checks */ + DEBUG("Total memsz = %#0lx, memsz of largest segment = %#0lx\n", + memsz_total, memsz_max); +} + +/** + * find_or_create_share_path - obtain a directory to store the shared + * hugetlbfs files + * + * Checks environment and filesystem to locate a suitable directory + * for shared hugetlbfs files, creating a new directory if necessary. + * The determined path is stored in global variable share_readonly_path. 
+ * + * returns: + * -1, on error + * 0, on success + */ +static int find_or_create_share_path(long page_size) +{ + const char *base_path; + struct stat sb; + int ret; + + /* If no remaping is planned for the read-only segments we are done */ + if (!page_size) + return 0; + + if (__hugetlb_opts.share_path) { + /* Given an explicit path */ + if (hugetlbfs_test_path(__hugetlb_opts.share_path) != 1) { + WARNING("HUGETLB_SHARE_PATH %s is not on a hugetlbfs" + " filesystem\n", __hugetlb_opts.share_path); + return -1; + } + + /* Make sure the page size matches */ + if (page_size != + hugetlbfs_test_pagesize(__hugetlb_opts.share_path)) { + WARNING("HUGETLB_SHARE_PATH %s is not valid for a %li " + "kB page size\n", __hugetlb_opts.share_path, + page_size / 1024); + return -1; + } + assemble_path(share_readonly_path, "%s", + __hugetlb_opts.share_path); + return 0; + } + + base_path = hugetlbfs_find_path_for_size(page_size); + if (!base_path) + return -1; + + assemble_path(share_readonly_path, "%s/elflink-uid-%d", + base_path, getuid()); + + ret = mkdir(share_readonly_path, 0700); + if ((ret != 0) && (errno != EEXIST)) { + WARNING("Error creating share directory %s\n", + share_readonly_path); + return -1; + } + + /* Check the share directory is sane */ + ret = lstat(share_readonly_path, &sb); + if (ret != 0) { + WARNING("Couldn't stat() %s: %s\n", share_readonly_path, + strerror(errno)); + return -1; + } + + if (! 
S_ISDIR(sb.st_mode)) { + WARNING("%s is not a directory\n", share_readonly_path); + return -1; + } + + if (sb.st_uid != getuid()) { + WARNING("%s has wrong owner (uid=%d instead of %d)\n", + share_readonly_path, sb.st_uid, getuid()); + return -1; + } + + if (sb.st_mode & (S_IWGRP | S_IWOTH)) { + WARNING("%s has bad permissions 0%03o\n", + share_readonly_path, sb.st_mode); + return -1; + } + + return 0; +} + +/* + * Look for non-zero BSS data inside a range and print out any matches + */ + +static void check_bss(unsigned long *start, unsigned long *end) +{ + unsigned long *addr; + + for (addr = start; addr < end; addr++) { + if (*addr != 0) + DEBUG("Non-zero BSS data @ %p: %lx\n", addr, *addr); + } +} + +/** + * get_shared_file_name - create a shared file name from program name, + * segment number and current word size + * @htlb_seg_info: pointer to program's segment data + * @file_path: pointer to a PATH_MAX+1 array to store filename in + * + * The file name created is *not* intended to be unique, except when + * the name, gid or phdr number differ. The goal here is to have a + * standard means of accessing particular segments of particular + * executables. 
+ * + * returns: + * -1, on failure + * 0, on success + */ +static int get_shared_file_name(struct seg_info *htlb_seg_info, char *file_path) +{ + int ret; + char binary[PATH_MAX+1]; + char *binary2; + + memset(binary, 0, sizeof(binary)); + ret = readlink("/proc/self/exe", binary, PATH_MAX); + if (ret < 0) { + WARNING("shared_file: readlink() on /proc/self/exe " + "failed: %s\n", strerror(errno)); + return -1; + } + + binary2 = basename(binary); + if (!binary2) { + WARNING("shared_file: basename() on %s failed: %s\n", + binary, strerror(errno)); + return -1; + } + + assemble_path(file_path, "%s/%s_%zd_%d", share_readonly_path, binary2, + sizeof(unsigned long) * 8, htlb_seg_info->index); + + return 0; +} + +/* Find the .dynamic program header */ +static int find_dynamic(Elf_Dyn **dyntab, const Elf_Phdr *phdr, int phnum) +{ + int i = 1; + + while ((phdr[i].p_type != PT_DYNAMIC) && (i < phnum)) { + ++i; + } + if (phdr[i].p_type == PT_DYNAMIC) { + *dyntab = (Elf_Dyn *)phdr[i].p_vaddr; + return 0; + } else { + DEBUG("No dynamic segment found\n"); + return -1; + } +} + +/* Find the dynamic string and symbol tables */ +static int find_tables(Elf_Dyn *dyntab, Elf_Sym **symtab, char **strtab) +{ + int i = 1; + while ((dyntab[i].d_tag != DT_NULL)) { + if (dyntab[i].d_tag == DT_SYMTAB) + *symtab = (Elf_Sym *)dyntab[i].d_un.d_ptr; + else if (dyntab[i].d_tag == DT_STRTAB) + *strtab = (char *)dyntab[i].d_un.d_ptr; + i++; + } + + if (!*symtab) { + DEBUG("No symbol table found\n"); + return -1; + } + if (!*strtab) { + DEBUG("No string table found\n"); + return -1; + } + return 0; +} + +/* Find the number of symbol table entries */ +static int find_numsyms(Elf_Sym *symtab, char *strtab) +{ + /* + * WARNING - The symbol table size calculation does not follow the ELF + * standard, but rather exploits an assumption we enforce in + * our linker scripts that the string table follows + * immediately after the symbol table. 
The linker scripts + * must maintain this assumption or this code will break. + */ + if ((void *)strtab <= (void *)symtab) { + DEBUG("Could not calculate dynamic symbol table size\n"); + return -1; + } + return ((void *)strtab - (void *)symtab) / sizeof(Elf_Sym); +} + +/* + * To reduce the size of the extra copy window, we can eliminate certain + * symbols based on information in the dynamic section. The following + * characteristics apply to symbols which may require copying: + * - Within the BSS + * - Global or Weak binding + * - Object type (variable) + * - Non-zero size (zero size means the symbol is just a marker with no data) + */ +static inline int keep_symbol(char *strtab, Elf_Sym *s, void *start, void *end) +{ + if ((void *)s->st_value < start) + return 0; + if ((void *)s->st_value > end) + return 0; + if ((ELF_ST_BIND(s->st_info) != STB_GLOBAL) && + (ELF_ST_BIND(s->st_info) != STB_WEAK)) + return 0; + if (ELF_ST_TYPE(s->st_info) != STT_OBJECT) + return 0; + if (s->st_size == 0) + return 0; + + if (__hugetlbfs_debug) + DEBUG("symbol to copy at %p: %s\n", (void *)s->st_value, + strtab + s->st_name); + + return 1; +} + +/* If unspecified by the architecture, no extra copying of the plt is needed */ +ElfW(Word) __attribute__ ((weak)) plt_extrasz(ElfW(Dyn) *dyntab) +{ + return 0; +} + +/* + * Subtle: Since libhugetlbfs depends on glibc, we allow it + * it to be loaded before us. As part of its init functions, it + * initializes stdin, stdout, and stderr in the bss. We need to + * include these initialized variables in our copy. 
+ */ + +static void get_extracopy(struct seg_info *seg, const Elf_Phdr *phdr, int phnum) +{ + Elf_Dyn *dyntab; /* dynamic segment table */ + Elf_Sym *symtab = NULL; /* dynamic symbol table */ + Elf_Sym *sym; /* a symbol */ + char *strtab = NULL; /* string table for dynamic symbols */ + int ret, numsyms, found_sym = 0; + void *start, *end, *end_orig; + void *sym_start, *sym_end; + void *plt_end; + + end_orig = seg->vaddr + seg->memsz; + start = seg->vaddr + seg->filesz; + if (seg->filesz == seg->memsz) + return; + if (!__hugetlb_opts.min_copy) + goto bail2; + + /* Find dynamic program header */ + ret = find_dynamic(&dyntab, phdr, phnum); + if (ret < 0) + goto bail; + + /* Find symbol and string tables */ + ret = find_tables(dyntab, &symtab, &strtab); + if (ret < 0) + goto bail; + + numsyms = find_numsyms(symtab, strtab); + if (numsyms < 0) + goto bail; + + /* + * We must ensure any returns done hereafter have sane start and end + * values, as the criss-cross apple sauce algorithm is beginning + */ + end = start; + + for (sym = symtab; sym < symtab + numsyms; sym++) { + if (!keep_symbol(strtab, sym, start, end_orig)) + continue; + + /* These are the droids we are looking for */ + found_sym = 1; + sym_start = (void *)sym->st_value; + sym_end = (void *)(sym->st_value + sym->st_size); + if (sym_end > end) + end = sym_end; + } + + /* + * Some platforms (PowerPC 64bit ELF) place their PLT beyond the filesz + * part of the data segment. When this is the case, we must extend the + * copy window to include this data which has been initialized by the + * run-time linker. 
+ */ + plt_end = start + plt_extrasz(dyntab); + if (plt_end > end) { + end = plt_end; + found_sym = 1; + } + + if (__hugetlbfs_debug) + check_bss(end, end_orig); + + if (found_sym) { + seg->extrasz = end - start; + } + /* + * else no need to copy anything, so leave seg->extrasz as zero + */ + return; + +bail: + DEBUG("Unable to perform minimal copy\n"); +bail2: + seg->extrasz = end_orig - start; +} + +#if defined(__powerpc64__) || defined (__powerpc__) +#define SLICE_LOW_TOP (0x100000000UL) +#define SLICE_LOW_SIZE (1UL << SLICE_LOW_SHIFT) +#define SLICE_HIGH_SIZE (1UL << SLICE_HIGH_SHIFT) +#endif + +/* + * Return the address of the start and end of the hugetlb slice + * containing @addr. A slice is a range of addresses, start inclusive + * and end exclusive. + * Note, that since relinking is not supported on ia64, we can leave it + * out here. + */ +static unsigned long hugetlb_slice_start(unsigned long addr) +{ +#if defined(__powerpc64__) + if (addr < SLICE_LOW_TOP) + return ALIGN_DOWN(addr, SLICE_LOW_SIZE); + else if (addr < SLICE_HIGH_SIZE) + return SLICE_LOW_TOP; + else + return ALIGN_DOWN(addr, SLICE_HIGH_SIZE); +#elif defined(__powerpc__) + return ALIGN_DOWN(addr, SLICE_LOW_SIZE); +#else + return ALIGN_DOWN(addr, gethugepagesize()); +#endif +} + +static unsigned long hugetlb_slice_end(unsigned long addr) +{ +#if defined(__powerpc64__) + if (addr < SLICE_LOW_TOP) + return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1; + else + return ALIGN_UP(addr, SLICE_HIGH_SIZE) - 1; +#elif defined(__powerpc__) + return ALIGN_UP(addr, SLICE_LOW_SIZE) - 1; +#else + return ALIGN_UP(addr, gethugepagesize()) - 1; +#endif +} + +static unsigned long hugetlb_next_slice_start(unsigned long addr) +{ + return hugetlb_slice_end(addr) + 1; +} + +static unsigned long hugetlb_prev_slice_end(unsigned long addr) +{ + return hugetlb_slice_start(addr) - 1; +} + +/* + * Store a copy of the given program header + */ +static int save_phdr(int table_idx, int phnum, const ElfW(Phdr) *phdr) +{ + int prot = 
0; + + if (table_idx >= MAX_HTLB_SEGS) { + WARNING("Executable has too many segments (max %d)\n", + MAX_HTLB_SEGS); + htlb_num_segs = 0; + return -1; + } + + if (phdr->p_flags & PF_R) + prot |= PROT_READ; + if (phdr->p_flags & PF_W) + prot |= PROT_WRITE; + if (phdr->p_flags & PF_X) + prot |= PROT_EXEC; + + htlb_seg_table[table_idx].vaddr = (void *) phdr->p_vaddr; + htlb_seg_table[table_idx].filesz = phdr->p_filesz; + htlb_seg_table[table_idx].memsz = phdr->p_memsz; + htlb_seg_table[table_idx].prot = prot; + htlb_seg_table[table_idx].index = phnum; + + INFO("Segment %d (phdr %d): %#0lx-%#0lx (filesz=%#0lx) " + "(prot = %#0x)\n", table_idx, phnum, + (unsigned long) phdr->p_vaddr, + (unsigned long) phdr->p_vaddr + phdr->p_memsz, + (unsigned long) phdr->p_filesz, (unsigned int) prot); + + return 0; +} + +static int verify_segment_layout(struct seg_layout *segs, int num_segs) +{ + int i; + long base_size = getpagesize(); + + for (i = 1; i < num_segs; i++) { + unsigned long prev_end = segs[i - 1].end; + unsigned long start = segs[i].start; + + /* + * Do not worry about the boundary between segments that will + * not be remapped. 
+ */ + if (segs[i - 1].page_size == base_size && + segs[i].page_size == base_size) + continue; + + /* Make sure alignment hasn't caused segments to overlap */ + if (prev_end > start) { + WARNING("Layout problem with segments %i and %i:\n\t" + "Segments would overlap\n", i - 1, i); + return 1; + } + + /* Make sure page size transitions occur on slice boundaries */ + if ((segs[i - 1].page_size != segs[i].page_size) && + hugetlb_slice_end(prev_end) > + hugetlb_slice_start(start)) { + WARNING("Layout problem with segments %i and %i:\n\t" + "Only one page size per slice\n", i - 1, i); + return 1; + } + } + return 0; +} + +static long segment_requested_page_size(const ElfW(Phdr) *phdr) +{ + int writable = phdr->p_flags & PF_W; + + /* Check if a page size was requested by the user */ + if (writable && hpage_writable_size) + return hpage_writable_size; + if (!writable && hpage_readonly_size) + return hpage_readonly_size; + + /* Check if this segment requests remapping by default */ + if (!hpage_readonly_size && !hpage_writable_size && + (phdr->p_flags & PF_LINUX_HUGETLB)) + return gethugepagesize(); + + /* No remapping selected, return the base page size */ + return getpagesize(); +} + +static +int parse_elf_normal(struct dl_phdr_info *info, size_t size, void *data) +{ + int i, num_segs; + unsigned long page_size, seg_psize, start, end; + struct seg_layout segments[MAX_SEGS]; + + page_size = getpagesize(); + num_segs = 0; + + for (i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type != PT_LOAD) + continue; + + if (i >= MAX_SEGS) { + WARNING("Maximum number of PT_LOAD segments" + "exceeded\n"); + return 1; + } + + seg_psize = segment_requested_page_size(&info->dlpi_phdr[i]); + if (seg_psize != page_size) { + if (save_phdr(htlb_num_segs, i, &info->dlpi_phdr[i])) + return 1; + get_extracopy(&htlb_seg_table[htlb_num_segs], + &info->dlpi_phdr[0], info->dlpi_phnum); + htlb_seg_table[htlb_num_segs].page_size = seg_psize; + htlb_num_segs++; + } + start = 
ALIGN_DOWN(info->dlpi_phdr[i].p_vaddr, seg_psize); + end = ALIGN(info->dlpi_phdr[i].p_vaddr + + info->dlpi_phdr[i].p_memsz, seg_psize); + + segments[num_segs].page_size = seg_psize; + segments[num_segs].start = start; + segments[num_segs].end = end; + num_segs++; + } + if (verify_segment_layout(segments, num_segs)) + htlb_num_segs = 0; + + if (__hugetlbfs_debug) + check_memsz(); + + return 1; +} + +/* + * Parse the phdrs of a normal program to attempt partial segment remapping + */ +static +int parse_elf_partial(struct dl_phdr_info *info, size_t size, void *data) +{ + unsigned long vaddr, memsz, gap; + unsigned long slice_end; + int i; + + /* This should never actually be called more than once in an + * iteration: we assume that dl_iterate_phdrs() always gives + * us the main program's phdrs on the first iteration, and + * always return 1 to cease iteration at that point. */ + + for (i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type != PT_LOAD) + continue; + + /* + * Partial segment remapping only makes sense if the + * memory size of the segment is larger than the + * granularity at which hugepages can be used. This + * mostly affects ppc, where the segment must be larger + * than 256M. This guarantees that remapping the binary + * in this forced way won't violate any contiguity + * constraints. 
+ */ + vaddr = hugetlb_next_slice_start(info->dlpi_phdr[i].p_vaddr); + gap = vaddr - info->dlpi_phdr[i].p_vaddr; + slice_end = hugetlb_slice_end(vaddr); + /* + * we should stop remapping just before the slice + * containing the end of the memsz portion (taking away + * the gap of the memsz) + */ + memsz = info->dlpi_phdr[i].p_memsz; + if (memsz < gap) { + INFO("Segment %d's unaligned memsz is too small: " + "%#0lx < %#0lx\n", + i, memsz, gap); + continue; + } + memsz -= gap; + if (memsz < (slice_end - vaddr)) { + INFO("Segment %d's aligned memsz is too small: " + "%#0lx < %#0lx\n", + i, memsz, slice_end - vaddr); + continue; + } + memsz = hugetlb_prev_slice_end(vaddr + memsz) - vaddr; + + if (save_phdr(htlb_num_segs, i, &info->dlpi_phdr[i])) + return 1; + + /* + * When remapping partial segments, we create a sub-segment + * that is based on the original. For this reason, we must + * make some changes to the phdr captured by save_phdr(): + * vaddr is aligned upwards to a slice boundary + * memsz is aligned downwards to a slice boundary + * filesz is set to memsz to force all memory to be copied + */ + htlb_seg_table[htlb_num_segs].vaddr = (void *)vaddr; + htlb_seg_table[htlb_num_segs].filesz = memsz; + htlb_seg_table[htlb_num_segs].memsz = memsz; + + htlb_num_segs++; + } + return 1; +} + +/* + * Verify that a range of memory is unoccupied and usable + */ +static void check_range_empty(void *addr, unsigned long len) +{ + void *p; + + p = mmap(addr, len, PROT_READ, MAP_PRIVATE|MAP_ANON, 0, 0); + if (p != addr) { + WARNING("Unable to verify address range %p - %p. Not empty?\n", + addr, addr + len); + if (__hugetlbfs_debug) + dump_proc_pid_maps(); + } + if (p != MAP_FAILED) + munmap(p, len); +} + +/* + * Copy a program segment into a huge page. If possible, try to copy the + * smallest amount of data possible, unless the user disables this + * optimization via the HUGETLB_ELFMAP environment variable. 
+ */ +static int prepare_segment(struct seg_info *seg) +{ + void *start, *p, *end, *new_end; + unsigned long size, offset; + long page_size = getpagesize(); + long hpage_size; + int mmap_reserve = __hugetlb_opts.no_reserve ? MAP_NORESERVE : 0; + + hpage_size = seg->page_size; + + /* + * mmaps must begin at an address aligned to the page size. If the + * vaddr of this segment is not hpage_size aligned, align it downward + * and begin the mmap there. Note the offset so we can copy data to + * the correct starting address within the temporary mmap. + */ + start = (void *) ALIGN_DOWN((unsigned long)seg->vaddr, hpage_size); + offset = seg->vaddr - start; + + /* + * Calculate the size of the temporary mapping we must create. + * This includes the offset (described above) and the filesz and + * extrasz portions of the segment (described below). We must align + * this total to the huge page size so it will be valid for mmap. + */ + size = ALIGN(offset + seg->filesz + seg->extrasz, hpage_size); + + /* + * If the segment's start or end addresses have been adjusted to align + * them to the hpage_size, check to make sure nothing is mapped in the + * padding before and after the segment. + */ + end = (void *) ALIGN((unsigned long)seg->vaddr + seg->memsz, page_size); + new_end = (void *) ALIGN((unsigned long)end, hpage_size); + if (offset) + check_range_empty(start, ALIGN_DOWN(offset, page_size)); + if (end != new_end) + check_range_empty(end, new_end - end); + + /* Create the temporary huge page mmap */ + p = mmap(NULL, size, PROT_READ|PROT_WRITE, + MAP_SHARED|mmap_reserve, seg->fd, 0); + if (p == MAP_FAILED) { + WARNING("Couldn't map hugepage segment to copy data: %s\n", + strerror(errno)); + return -1; + } + + /* + * Minimizing the amount of data copied will maximize performance. + * By definition, the filesz portion of the segment contains + * initialized data and must be copied. 
+ * We use the following algorithm to ensure that when processes race
+ * to instantiate the hugepage file, we will never obtain an
+ * incompletely prepared file or have multiple processes prepare
+ * separate copies of the file.
+ * - first open 'filename.tmp' with O_EXCL (this acts as a lockfile)
+ * - second open 'filename' with O_RDONLY (even if the first open
+ * succeeded).
+ * Then:
+ * - If both opens succeed, close the O_EXCL open, unlink
+ * filename.tmp and use the O_RDONLY fd. (Somebody else has prepared
+ * the file already)
+ * - If only the O_RDONLY open succeeds, and the O_EXCL open
+ * fails with EEXIST, just use the O_RDONLY fd. (Somebody else has
+ * prepared the file already, but we raced with their rename()).
+ * - If only the O_EXCL open succeeds, and the O_RDONLY fails with
+ * ENOENT, prepare the file via the O_EXCL open, then rename() filename.tmp to
+ * filename. (We're the first in, we have to prepare the file).
+ * - If both opens fail, with EEXIST and ENOENT, respectively,
+ * wait for a little while, then try again from the beginning
+ * (Somebody else is preparing the file, but hasn't finished yet)
(fdx >= 0) { + /* It's our job to prepare */ + if (errnos != ENOENT) + WARNING("shared_file: Unexpected failure on" + " shared open of %s: %s\n", final_path, + strerror(errnos)); + + htlb_seg_info->fd = fdx; + + INFO("Got unpopulated shared fd -- Preparing\n"); + ret = fork_and_prepare_segment(htlb_seg_info); + if (ret < 0) + goto fail; + + INFO("Prepare succeeded\n"); + /* move to permanent location */ + ret = rename(tmp_path, final_path); + if (ret != 0) { + WARNING("shared_file: unable to rename %s" + " to %s: %s\n", tmp_path, final_path, + strerror(errno)); + goto fail; + } + + return 0; + } + + /* Both opens failed, somebody else is still preparing */ + /* Wait and try again */ + sleep(1); + } + + fail: + if (fdx > 0) { + ret = unlink(tmp_path); + if (ret != 0) + WARNING("shared_file: Unable to clean up temp file %s " + "on failure: %s\n", tmp_path, strerror(errno)); + close(fdx); + } + + return -1; +} + +/** + * obtain_prepared_file - multiplex callers depending on if + * sharing or not + * @htlb_seg_info: pointer to program's segment data + * + * returns: + * -1, on error + * 0, on success + */ +static int obtain_prepared_file(struct seg_info *htlb_seg_info) +{ + int fd = -1; + int ret; + long hpage_size = htlb_seg_info->page_size; + + /* Share only read-only segments */ + if (__hugetlb_opts.sharing && !(htlb_seg_info->prot & PROT_WRITE)) { + /* first, try to share */ + ret = find_or_prepare_shared_file(htlb_seg_info); + if (ret == 0) + return 0; + /* but, fall through to unlinked files, if sharing fails */ + WARNING("Falling back to unlinked files\n"); + } + fd = hugetlbfs_unlinked_fd_for_size(hpage_size); + if (fd < 0) + return -1; + htlb_seg_info->fd = fd; + + return fork_and_prepare_segment(htlb_seg_info); +} + +static void remap_segments(struct seg_info *seg, int num) +{ + int i; + void *p; + unsigned long start, offset, mapsize; + long page_size = getpagesize(); + long hpage_size; + int mmap_flags; + + /* + * XXX: The bogus call to mmap below forces 
ld.so to resolve the + * mmap symbol before we unmap the plt in the data segment + * below. This might only be needed in the case where sharing + * is enabled and the hugetlbfs files have already been prepared + * by another process. + */ + p = mmap(0, 0, 0, 0, 0, 0); + + /* This is the hairy bit, between unmap and remap we enter a + * black hole. We can't call anything which uses static data + * (ie. essentially any library function...) + */ + for (i = 0; i < num; i++) { + start = ALIGN_DOWN((unsigned long)seg[i].vaddr, page_size); + offset = (unsigned long)(seg[i].vaddr - start); + mapsize = ALIGN(offset + seg[i].memsz, page_size); + munmap((void *) start, mapsize); + } + + /* Step 4. Rebuild the address space with hugetlb mappings */ + /* NB: we can't do the remap as hugepages within the main loop + * because of PowerPC: we may need to unmap all the normal + * segments before the MMU segment is ok for hugepages */ + for (i = 0; i < num; i++) { + hpage_size = seg[i].page_size; + start = ALIGN_DOWN((unsigned long)seg[i].vaddr, hpage_size); + offset = (unsigned long)(seg[i].vaddr - start); + mapsize = ALIGN(offset + seg[i].memsz, hpage_size); + mmap_flags = MAP_PRIVATE|MAP_FIXED; + + /* If requested, make no reservations */ + if (__hugetlb_opts.no_reserve) + mmap_flags |= MAP_NORESERVE; + + /* + * If this is a read-only mapping whose contents are + * entirely contained within the file, then use MAP_NORESERVE. 
+ * The assumption is that the pages already exist in the + * page cache for the hugetlbfs file since it was prepared + * earlier and that mprotect() will not be called which would + * require a COW + */ + if (!(seg[i].prot & PROT_WRITE) && + seg[i].filesz == seg[i].memsz) + mmap_flags |= MAP_NORESERVE; + + p = mmap((void *) start, mapsize, seg[i].prot, + mmap_flags, seg[i].fd, 0); + if (p == MAP_FAILED) + unmapped_abort("Failed to map hugepage segment %u: " + "%p-%p (errno=%u)\n", i, start, + start + mapsize, errno); + if (p != (void *) start) + unmapped_abort("Mapped hugepage segment %u (%p-%p) at " + "wrong address %p\n", i, seg[i].vaddr, + seg[i].vaddr+mapsize, p); + } + /* The segments are all back at this point. + * and it should be safe to reference static data + */ +} + +static int set_hpage_sizes(const char *env) +{ + char *pos; + long size; + char *key; + char keys[5] = { "R\0" "W\0" "\0" }; + + /* For each key in R,W */ + for (key = keys; *key != '\0'; key += 2) { + pos = strcasestr(env, key); + if (!pos) + continue; + + if (*(++pos) == '=') { + size = parse_page_size(pos + 1); + if (size == -1) + return size; + } else + size = gethugepagesize(); + + if (size <= 0) { + if (errno == ENOSYS) + WARNING("Hugepages unavailable\n"); + else if (errno == EOVERFLOW) + WARNING("Hugepage size too large\n"); + else + WARNING("Hugepage size (%s)\n", + strerror(errno)); + size = 0; + } else if (!hugetlbfs_find_path_for_size(size)) { + WARNING("Hugepage size %li unavailable", size); + size = 0; + } + + if (*key == 'R') + hpage_readonly_size = size; + else + hpage_writable_size = size; + } + return 0; +} + +static int check_env(void) +{ + extern Elf_Ehdr __executable_start __attribute__((weak)); + + if (__hugetlb_opts.elfmap && + (strcasecmp(__hugetlb_opts.elfmap, "no") == 0)) { + INFO("HUGETLB_ELFMAP=%s, not attempting to remap program " + "segments\n", __hugetlb_opts.elfmap); + return -1; + } + if (__hugetlb_opts.elfmap && set_hpage_sizes(__hugetlb_opts.elfmap)) { + 
WARNING("Cannot set elfmap page sizes: %s", strerror(errno)); + return -1; + } + + if (__hugetlb_opts.ld_preload && + strstr(__hugetlb_opts.ld_preload, "libhugetlbfs")) { + if (__hugetlb_opts.force_elfmap) { + force_remap = 1; + INFO("HUGETLB_FORCE_ELFMAP=yes, " + "enabling partial segment " + "remapping for non-relinked " + "binaries\n"); + INFO("Disabling filesz copy optimization\n"); + __hugetlb_opts.min_copy = false; + } else { + if (&__executable_start) { + WARNING("LD_PRELOAD is incompatible with " + "segment remapping\n"); + WARNING("Segment remapping has been " + "DISABLED\n"); + return -1; + } + } + } + + if (__hugetlb_opts.sharing == 2) { + WARNING("HUGETLB_SHARE=%d, however sharing of writable\n" + "segments has been deprecated and is now disabled\n", + __hugetlb_opts.sharing); + __hugetlb_opts.sharing = 0; + } else { + INFO("HUGETLB_SHARE=%d, sharing ", __hugetlb_opts.sharing); + if (__hugetlb_opts.sharing == 1) { + INFO_CONT("enabled for only read-only segments\n"); + } else { + INFO_CONT("disabled\n"); + __hugetlb_opts.sharing = 0; + } + } + + INFO("HUGETLB_NO_RESERVE=%s, reservations %s\n", + __hugetlb_opts.no_reserve ? "yes" : "no", + __hugetlb_opts.no_reserve ? "disabled" : "enabled"); + + return 0; +} + +/* + * Parse an ELF header and record segment information for any segments + * which contain hugetlb information. 
+ */ +static int parse_elf() +{ + if (force_remap) + dl_iterate_phdr(parse_elf_partial, NULL); + else + dl_iterate_phdr(parse_elf_normal, NULL); + + if (htlb_num_segs == 0) { + INFO("No segments were appropriate for remapping\n"); + return -1; + } + + return 0; +} + +void hugetlbfs_setup_elflink(void) +{ + int i, ret; + + if (check_env()) + return; + + if (parse_elf()) + return; + + INFO("libhugetlbfs version: %s\n", VERSION); + + /* Do we need to find a share directory */ + if (__hugetlb_opts.sharing) { + /* + * If HUGETLB_ELFMAP is undefined but a shareable segment has + * PF_LINUX_HUGETLB set, segment remapping will occur using the + * default huge page size. + */ + long page_size = hpage_readonly_size ? + hpage_readonly_size : gethugepagesize(); + + ret = find_or_create_share_path(page_size); + if (ret != 0) { + WARNING("Segment remapping is disabled"); + return; + } + } + + /* Step 1. Obtain hugepage files with our program data */ + for (i = 0; i < htlb_num_segs; i++) { + ret = obtain_prepared_file(&htlb_seg_table[i]); + if (ret < 0) { + WARNING("Failed to setup hugetlbfs file for segment " + "%d\n", i); + + /* Close files we have already prepared */ + for (; i >= 0; i--) + close(htlb_seg_table[i].fd); + + return; + } + } + + /* Step 3. 
+# must be executed as root to operate
+# ask how much memory they want to allocate for huge pages
" + "(input in MB, unless postfixed with GB): ") + if userIn[-2:] == "GB": + userHugePageReqMB = int(userIn[0:-2]) * 1024 + elif userIn[-1:] == "G": + userHugePageReqMB = int(userIn[0:-1]) * 1024 + elif userIn[-2:] == "MB": + userHugePageReqMB = int(userIn[0:-2]) + elif userIn[-1:] == "M": + userHugePageReqMB = int(userIn[0:-1]) + else: + userHugePageReqMB = int(userIn) + # As a sanity safeguard, require at least 128M not be allocated to huge pages + if userHugePageReqMB > (memTotal - 128): + userIn = None + print "Refusing to allocate %d, you must leave at least 128MB for the system" % userHugePageReqMB + elif userHugePageReqMB < (hugePageSize / (1024 * 1024)): + userIn = None + print "Sorry, allocation must be at least a page's worth!" + else: + break + except ValueError: + userIn = None + print "Input must be an integer, please try again!" +userHugePageReqKB = userHugePageReqMB * 1024 +userHugePagesReq = userHugePageReqKB / (hugePageSize / 1024) +print "Okay, we'll try to allocate %d MB for huge pages..." % userHugePageReqMB +print + + +# some basic user input validation +badchars = list(' \\\'":;~`!$^&*(){}[]?/><,') +inputIsValid = False +# ask for the name of the group allowed access to huge pages +while inputIsValid == False: + foundbad = False + userGroupReq = raw_input("What group should have access to the huge pages?" + "(The group will be created, if need be) [hugepages]: ") + if userGroupReq is '': + userGroupReq = 'hugepages' + if userGroupReq[0].isdigit() or userGroupReq[0] == "-": + foundbad = True + print "Group names cannot start with a number or dash, please try again!" + for char in badchars: + if char in userGroupReq: + foundbad = True + print "Illegal characters in group name, please try again!" + break + if len(userGroupReq) > 16: + foundbad = True + print "Group names can't be more than 16 characaters, please try again!" 
+ if foundbad == False: + inputIsValid = True +print "Okay, we'll give group %s access to the huge pages" % userGroupReq + + +# see if group already exists, use it if it does, if not, create it +userGIDReq = -1 +for line in groupNames: + curGroupName = line.split(":")[0] + if curGroupName == userGroupReq: + userGIDReq = int(line.split(":")[2]) + break + +if userGIDReq > -1: + print "Group %s (gid %d) already exists, we'll use it" % (userGroupReq, userGIDReq) +else: + if debug == False: + os.popen("/usr/sbin/groupadd %s" % userGroupReq) + else: + print "/usr/sbin/groupadd %s" % userGroupReq + groupNames = os.popen("/usr/bin/getent group %s" % userGroupReq).readlines() + for line in groupNames: + curGroupName = line.split(":")[0] + if curGroupName == userGroupReq: + userGIDReq = int(line.split(":")[2]) + break + print "Created group %s (gid %d) for huge page use" % (userGroupReq, userGIDReq) +print + + +# basic user input validation, take 2 +# space is valid in this case, wasn't in the prior incarnation +badchars = list('\\\'":;~`!$^&*(){}[]?/><,') +inputIsValid = False +# ask for user(s) that should be in the huge page access group +while inputIsValid == False: + foundbad = False + userUsersReq = raw_input("What user(s) should have access to the huge pages (space-delimited list, users created as needed)? ") + for char in badchars: + if char in userUsersReq: + foundbad = True + print "Illegal characters in user name(s) or invalid list format, please try again!" + break + for n in userUsersReq.split(): + if len(n) > 32: + foundbad = True + print "User names can't be more than 32 characaters, please try again!" + break + if n[0] == "-": + foundbad = True + print "User names cannot start with a dash, please try again!" 
+ break + if foundbad == False: + inputIsValid = True +# see if user(s) already exist(s) +curUserList = os.popen("/usr/bin/getent passwd").readlines() +hugePageUserList = userUsersReq.split() +for hugeUser in hugePageUserList: + userExists = False + for line in curUserList: + curUser = line.split(":")[0] + if curUser == hugeUser: + print "Adding user %s to huge page group" % hugeUser + userExists = True + if debug == False: + os.popen("/usr/sbin/usermod -a -G %s %s" % (userGroupReq, hugeUser)) + else: + print "/usr/sbin/usermod -a -G %s %s" % (userGroupReq, hugeUser) + if userExists == True: + break + if userExists == False: + print "Creating user %s with membership in huge page group" % hugeUser + if debug == False: + if hugeUser == userGroupReq: + os.popen("/usr/sbin/useradd %s -g %s" % (hugeUser, userGroupReq)) + else: + os.popen("/usr/sbin/useradd %s -G %s" % (hugeUser, userGroupReq)) + else: + print "/usr/sbin/useradd %s -G %s" % (hugeUser, userGroupReq) +print + + +# set values for the current running environment +if debug == False: + os.popen("/usr/bin/hugeadm --pool-pages-min DEFAULT:%sM" % userHugePageReqMB) + os.popen("/usr/bin/hugeadm --pool-pages-max DEFAULT:%sM" % userHugePageReqMB) + os.popen("/usr/bin/hugeadm --set-shm-group %d" % userGIDReq) + os.popen("/usr/bin/hugeadm --set-recommended-shmmax") +else: + print "/usr/bin/hugeadm --pool-pages-min DEFAULT:%sM" % userHugePageReqMB + print "/usr/bin/hugeadm --pool-pages-max DEFAULT:%sM" % userHugePageReqMB + print "/usr/bin/hugeadm --set-shm-group %d" % userGIDReq + print "/usr/bin/hugeadm --set-recommended-shmmax" + print + +# figure out what that shmmax value we just set was +hugeadmexplain = os.popen("/usr/bin/hugeadm --explain 2>/dev/null").readlines() +for line in hugeadmexplain: + if line.strip().startswith("kernel.shmmax = "): + shmmax = int(line.split()[2]) + break + +# write out sysctl config changes to persist across reboot +if debug == False: + sysctlConfLines = "# sysctl configuration\n" + 
if os.access(sysctlConf, os.W_OK): + try: + sysctlConfLines = open(sysctlConf).readlines() + os.rename(sysctlConf, sysctlConf + ".backup") + print("Saved original %s as %s.backup" % (sysctlConf, sysctlConf)) + except: + pass + + fd = open(sysctlConf, "w") + for line in sysctlConfLines: + if line.startswith("kernel.shmmax"): + continue + elif line.startswith("vm.nr_hugepages"): + continue + elif line.startswith("vm.hugetlb_shm_group"): + continue + else: + fd.write(line); + + fd.write("kernel.shmmax = %d\n" % shmmax) + fd.write("vm.nr_hugepages = %d\n" % userHugePagesReq) + fd.write("vm.hugetlb_shm_group = %d\n" % userGIDReq) + fd.close() + +else: + print "Add to %s:" % sysctlConf + print "kernel.shmmax = %d" % shmmax + print "vm.nr_hugepages = %d" % userHugePagesReq + print "vm.hugetlb_shm_group = %d" % userGIDReq + print + + +# write out limits.conf changes to persist across reboot +if debug == False: + limitsConfLines = "# Huge page access configuration\n" + if os.access(limitsConf, os.W_OK): + try: + limitsConfLines = open(limitsConf).readlines() + os.rename(limitsConf, limitsConf + ".backup") + print("Saved original %s as %s.backup" % (limitsConf, limitsConf)) + except: + pass + + fd = open(limitsConf, "w") + for line in limitsConfLines: + cfgExist = False + for hugeUser in hugePageUserList: + try: + if line.split()[0] == hugeUser: + cfgExist = True + except IndexError: + # hit either white or comment line, it is safe not to take + # any action and continue. 
+ pass + if cfgExist == True: + continue + else: + fd.write(line) + + for hugeUser in hugePageUserList: + fd.write("%s soft memlock %d\n" % (hugeUser, userHugePageReqKB)) + fd.write("%s hard memlock %d\n" % (hugeUser, userHugePageReqKB)) + fd.close() + +else: + print "Add to %s:" % limitsConf + for hugeUser in hugePageUserList: + print "%s soft memlock %d" % (hugeUser, userHugePageReqKB) + print "%s hard memlock %d" % (hugeUser, userHugePageReqKB) + + +# dump the final configuration of things now that we're done tweaking +print +print "Final configuration:" +print " * Total System Memory......: %6d MB" % memTotal +if debug == False: + print " * Shared Mem Max Mapping...: %6d MB" % (shmmax / (1024 * 1024)) +else: + # This should be what we *would* have set it to, had we actually run hugeadm --set-recommended-shmmax + print " * Shared Mem Max Mapping...: %6d MB" % (userHugePagesReq * hugePageSize / (1024 * 1024)) +print " * System Huge Page Size....: %6d MB" % (hugePageSize / (1024 * 1024)) +print " * Available Huge Pages.....: %6d" % userHugePagesReq +print " * Total size of Huge Pages.: %6d MB" % (userHugePagesReq * hugePageSize / (1024 * 1024)) +print " * Remaining System Memory..: %6d MB" % (memTotal - userHugePageReqMB) +print " * Huge Page User Group.....: %s (%d)" % (userGroupReq, userGIDReq) +print + diff --git a/default/libhugetlbfs/libhugetlbfs/hugeadm.c b/default/libhugetlbfs/libhugetlbfs/hugeadm.c new file mode 100644 index 0000000..781f23c --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/hugeadm.c @@ -0,0 +1,1699 @@ +/*************************************************************************** + * User front end for using huge pages Copyright (C) 2008, IBM * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the Lesser GNU General Public License as * + * published by the Free Software Foundation; either version 2.1 of the * + * License, or at your option) any later version. 
* + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the Lesser GNU General Public * + * License along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +/* + * hugeadm is designed to make an administrators life simpler, to automate + * and simplify basic system configuration as it relates to hugepages. It + * is designed to help with pool and mount configuration. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <limits.h> +#include <mntent.h> +#include <unistd.h> +#include <grp.h> +#include <pwd.h> +#include <fcntl.h> + +#include <sys/stat.h> +#include <sys/types.h> +#include <sys/mount.h> +#include <sys/swap.h> +#include <sys/wait.h> + +#define _GNU_SOURCE /* for getopt_long */ +#include <unistd.h> +#include <getopt.h> + +#define KB (1024) +#define MB (1024*KB) +#define GB (1024*MB) + +#define REPORT_UTIL "hugeadm" +#define REPORT(level, prefix, format, ...) 
\ + do { \ + if (verbose_level >= level) \ + fprintf(stderr, "hugeadm:" prefix ": " format, \ + ##__VA_ARGS__); \ + } while (0); + +#include "libhugetlbfs_internal.h" +#include "hugetlbfs.h" + +extern int optind; +extern char *optarg; + +#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text) +#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text) + +#define MOUNT_DIR "/var/lib/hugetlbfs" +#define OPT_MAX 4096 + +#define PROCMOUNTS "/proc/mounts" +#define PROCHUGEPAGES_MOVABLE "/proc/sys/vm/hugepages_treat_as_movable" +#define PROCMINFREEKBYTES "/proc/sys/vm/min_free_kbytes" +#define PROCSHMMAX "/proc/sys/kernel/shmmax" +#define PROCHUGETLBGROUP "/proc/sys/vm/hugetlb_shm_group" +#define PROCZONEINFO "/proc/zoneinfo" +#define FS_NAME "hugetlbfs" +#define MIN_COL 20 +#define MAX_SIZE_MNTENT (64 + PATH_MAX + 32 + 128 + 2 * sizeof(int)) +#define FORMAT_LEN 20 + +#define MEM_TOTAL "MemTotal:" +#define SWAP_FREE "SwapFree:" +#define SWAP_TOTAL "SwapTotal:" + +#define ALWAYS "always" +#define MADVISE "madvise" +#define NEVER "never" +#define TRANS_ENABLE "/sys/kernel/mm/transparent_hugepage/enabled" +#define KHUGE_SCAN_PAGES "/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan" +#define KHUGE_SCAN_SLEEP "/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs" +#define KHUGE_ALLOC_SLEEP "/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs" + +void print_usage() +{ + fprintf(stderr, "hugeadm [options]\n"); + fprintf(stderr, "options:\n"); + + OPTION("--list-all-mounts", "List all current hugetlbfs mount points"); + OPTION("--pool-list", "List all pools"); + OPTION("--hard", "specified with --pool-pages-min to make"); + CONT("multiple attempts at adjusting the pool size to the"); + CONT("specified count on failure"); + OPTION("--pool-pages-min <size|DEFAULT>:[+|-]<pagecount|memsize<G|M|K>>", ""); + CONT("Adjust pool 'size' lower bound"); + OPTION("--obey-mempolicy", "Obey the NUMA memory policy when"); + 
CONT("adjusting the pool 'size' lower bound"); + OPTION("--thp-always", "Enable transparent huge pages always"); + OPTION("--thp-madvise", "Enable transparent huge pages with madvise"); + OPTION("--thp-never", "Disable transparent huge pages"); + OPTION("--thp-khugepaged-pages <pages to scan>", "Number of pages that khugepaged"); + CONT("should scan on each pass"); + OPTION("--thp-khugepaged-scan-sleep <milliseconds>", "Time in ms to sleep between"); + CONT("khugepaged passes"); + OPTION("--thp-khugepages-alloc-sleep <milliseconds>", "Time in ms for khugepaged"); + CONT("to wait if there was a huge page allocation failure"); + OPTION("--pool-pages-max <size|DEFAULT>:[+|-]<pagecount|memsize<G|M|K>>", ""); + CONT("Adjust pool 'size' upper bound"); + OPTION("--set-recommended-min_free_kbytes", ""); + CONT("Sets min_free_kbytes to a recommended value to improve availability of"); + CONT("huge pages at runtime"); + OPTION("--set-recommended-shmmax", "Sets shmmax to a recommended value to"); + CONT("maximise the size possible for shared memory pools"); + OPTION("--set-shm-group <gid|groupname>", "Sets hugetlb_shm_group to the"); + CONT("specified group, which has permission to use hugetlb shared memory pools"); + OPTION("--add-temp-swap[=count]", "Specified with --pool-pages-min to create"); + CONT("temporary swap space for the duration of the pool resize. Default swap"); + CONT("size is 5 huge pages. Optional arg sets size to 'count' huge pages"); + OPTION("--add-ramdisk-swap", "Specified with --pool-pages-min to create"); + CONT("swap space on ramdisks. 
By default, swap is removed after the resize."); + OPTION("--persist", "Specified with --add-temp-swap or --add-ramdisk-swap"); + CONT("options to make swap space persist after the resize."); + OPTION("--enable-zone-movable", "Use ZONE_MOVABLE for huge pages"); + OPTION("--disable-zone-movable", "Do not use ZONE_MOVABLE for huge pages"); + OPTION("--create-mounts", "Creates a mount point for each available"); + CONT("huge page size on this system under /var/lib/hugetlbfs"); + OPTION("--create-user-mounts <user>", ""); + CONT("Creates a mount point for each available huge"); + CONT("page size under /var/lib/hugetlbfs/<user>"); + CONT("usable by user <user>"); + OPTION("--create-group-mounts <group>", ""); + CONT("Creates a mount point for each available huge"); + CONT("page size under /var/lib/hugetlbfs/<group>"); + CONT("usable by group <group>"); + OPTION("--create-global-mounts", ""); + CONT("Creates a mount point for each available huge"); + CONT("page size under /var/lib/hugetlbfs/global"); + CONT("usable by anyone"); + + OPTION("--max-size <size<G|M|K>>", "Limit the filesystem size of a new mount point"); + OPTION("--max-inodes <number>", "Limit the number of inodes on a new mount point"); + + OPTION("--page-sizes", "Display page sizes that a configured pool"); + OPTION("--page-sizes-all", + "Display page sizes support by the hardware"); + OPTION("--dry-run", "Print the equivalent shell commands for what"); + CONT("the specified options would have done without"); + CONT("taking any action"); + + OPTION("--explain", "Gives a overview of the status of the system"); + CONT("with respect to huge page availability"); + + OPTION("--verbose <level>, -v", "Increases/sets tracing levels"); + OPTION("--help, -h", "Prints this message"); +} + +int opt_dry_run = 0; +int opt_hard = 0; +int opt_movable = -1; +int opt_set_recommended_minfreekbytes = 0; +int opt_set_recommended_shmmax = 0; +int opt_set_hugetlb_shm_group = 0; +int opt_temp_swap = 0; +int opt_ramdisk_swap = 0; 
+int opt_swap_persist = 0; +int opt_obey_mempolicy = 0; +unsigned long opt_limit_mount_size = 0; +int opt_limit_mount_inodes = 0; +int verbose_level = VERBOSITY_DEFAULT; +char ramdisk_list[PATH_MAX] = ""; + +void setup_environment(char *var, char *val) +{ + if (opt_dry_run) { + printf("%s='%s'\n", var, val); + return; + } + + setenv(var, val, 1); + DEBUG("%s='%s'\n", var, val); +} + +/* Enable/disable allocation of hugepages from ZONE_MOVABLE */ +void setup_zone_movable(int able) +{ + if (opt_dry_run) { + printf("echo %d > %s\n", able, PROCHUGEPAGES_MOVABLE); + return; + } + + DEBUG("Setting %s to %d\n", PROCHUGEPAGES_MOVABLE, able); + + /* libhugetlbfs reports any error that occurs */ + file_write_ulong(PROCHUGEPAGES_MOVABLE, (unsigned long)able); +} + +void verbose_init(void) +{ + char *env; + + env = getenv("HUGETLB_VERBOSE"); + if (env) + verbose_level = atoi(env); + env = getenv("HUGETLB_DEBUG"); + if (env) + verbose_level = VERBOSITY_MAX; +} + +void verbose(char *which) +{ + int new_level; + + if (which) { + new_level = atoi(which); + if (new_level < 0 || new_level > 99) { + ERROR("%d: verbosity out of range 0-99\n", + new_level); + exit(EXIT_FAILURE); + } + } else { + new_level = verbose_level + 1; + if (new_level == 100) { + WARNING("verbosity limited to 99\n"); + new_level--; + } + } + verbose_level = new_level; +} + +void verbose_expose(void) +{ + char level[3]; + + if (verbose_level == 99) { + setup_environment("HUGETLB_DEBUG", "yes"); + } + snprintf(level, sizeof(level), "%d", verbose_level); + setup_environment("HUGETLB_VERBOSE", level); +} + +/* + * getopts return values for options which are long only. 
+ */ +#define LONG_POOL ('p' << 8) +#define LONG_POOL_LIST (LONG_POOL|'l') +#define LONG_POOL_MIN_ADJ (LONG_POOL|'m') +#define LONG_POOL_MAX_ADJ (LONG_POOL|'M') +#define LONG_POOL_MEMPOL (LONG_POOL|'p') + +#define LONG_SET_RECOMMENDED_MINFREEKBYTES ('k' << 8) +#define LONG_SET_RECOMMENDED_SHMMAX ('x' << 8) +#define LONG_SET_HUGETLB_SHM_GROUP ('R' << 8) + +#define LONG_MOVABLE ('z' << 8) +#define LONG_MOVABLE_ENABLE (LONG_MOVABLE|'e') +#define LONG_MOVABLE_DISABLE (LONG_MOVABLE|'d') + +#define LONG_HARD ('h' << 8) +#define LONG_SWAP ('s' << 8) +#define LONG_SWAP_DISK (LONG_SWAP|'d') +#define LONG_SWAP_RAMDISK (LONG_SWAP|'r') +#define LONG_SWAP_PERSIST (LONG_SWAP|'p') + +#define LONG_PAGE ('P' << 8) +#define LONG_PAGE_SIZES (LONG_PAGE|'s') +#define LONG_PAGE_AVAIL (LONG_PAGE|'a') + +#define LONG_MOUNTS ('m' << 8) +#define LONG_CREATE_MOUNTS (LONG_MOUNTS|'C') +#define LONG_CREATE_USER_MOUNTS (LONG_MOUNTS|'U') +#define LONG_CREATE_GROUP_MOUNTS (LONG_MOUNTS|'g') +#define LONG_CREATE_GLOBAL_MOUNTS (LONG_MOUNTS|'G') +#define LONG_LIST_ALL_MOUNTS (LONG_MOUNTS|'A') + +#define LONG_LIMITS ('l' << 8) +#define LONG_LIMIT_SIZE (LONG_LIMITS|'S') +#define LONG_LIMIT_INODES (LONG_LIMITS|'I') + +#define LONG_EXPLAIN ('e' << 8) + +#define LONG_TRANS ('t' << 8) +#define LONG_TRANS_ALWAYS (LONG_TRANS|'a') +#define LONG_TRANS_MADVISE (LONG_TRANS|'m') +#define LONG_TRANS_NEVER (LONG_TRANS|'n') + +#define LONG_KHUGE ('K' << 8) +#define LONG_KHUGE_PAGES (LONG_KHUGE|'p') +#define LONG_KHUGE_SCAN (LONG_KHUGE|'s') +#define LONG_KHUGE_ALLOC (LONG_KHUGE|'a') + +#define MAX_POOLS 32 + +static int cmpsizes(const void *p1, const void *p2) +{ + return ((struct hpage_pool *)p1)->pagesize > + ((struct hpage_pool *)p2)->pagesize; +} + +void pool_list(void) +{ + struct hpage_pool pools[MAX_POOLS]; + int pos; + int cnt; + + cnt = hpool_sizes(pools, MAX_POOLS); + if (cnt < 0) { + ERROR("unable to obtain pools list"); + exit(EXIT_FAILURE); + } + qsort(pools, cnt, sizeof(pools[0]), cmpsizes); + + 
printf("%10s %8s %8s %8s %8s\n", + "Size", "Minimum", "Current", "Maximum", "Default"); + for (pos = 0; cnt--; pos++) { + printf("%10ld %8ld %8ld %8ld %8s\n", pools[pos].pagesize, + pools[pos].minimum, pools[pos].size, + pools[pos].maximum, (pools[pos].is_default) ? "*" : ""); + } +} + +struct mount_list +{ + struct mntent entry; + char data[MAX_SIZE_MNTENT]; + struct mount_list *next; +}; + +void print_mounts(struct mount_list *current, int longest) +{ + char format_str[FORMAT_LEN]; + + snprintf(format_str, FORMAT_LEN, "%%-%ds %%s\n", longest); + printf(format_str, "Mount Point", "Options"); + while (current) { + printf(format_str, current->entry.mnt_dir, + current->entry.mnt_opts); + current = current->next; + } +} + +/* + * collect_active_mounts returns a list of active hugetlbfs + * mount points, and, if longest is not NULL, the number of + * characters in the longest mount point to ease output + * formatting. Caller is expected to free the list of mounts. + */ +struct mount_list *collect_active_mounts(int *longest) +{ + FILE *mounts; + struct mount_list *list, *current, *previous = NULL; + int length; + + /* First try /proc/mounts, then /etc/mtab */ + mounts = setmntent(PROCMOUNTS, "r"); + if (!mounts) { + mounts = setmntent(MOUNTED, "r"); + if (!mounts) { + ERROR("unable to open %s or %s for reading", + PROCMOUNTS, MOUNTED); + exit(EXIT_FAILURE); + } + } + + list = malloc(sizeof(struct mount_list)); + if (!list) { + ERROR("out of memory"); + exit(EXIT_FAILURE); + } + + list->next = NULL; + current = list; + while (getmntent_r(mounts, &(current->entry), current->data, MAX_SIZE_MNTENT)) { + if (strcasecmp(current->entry.mnt_type, FS_NAME) == 0) { + length = strlen(current->entry.mnt_dir); + if (longest && length > *longest) + *longest = length; + + current->next = malloc(sizeof(struct mount_list)); + if (!current->next) { + ERROR("out of memory"); + exit(EXIT_FAILURE); + } + previous = current; + current = current->next; + current->next = NULL; + } + } + + 
endmntent(mounts); + + if (previous) { + free(previous->next); + previous->next = NULL; + return list; + } + return NULL; +} + +void mounts_list_all(void) +{ + struct mount_list *list, *previous; + int longest = MIN_COL; + + list = collect_active_mounts(&longest); + + if (!list) { + ERROR("No hugetlbfs mount points found\n"); + return; + } + + print_mounts(list, longest); + + while (list) { + previous = list; + list = list->next; + free(previous); + } +} + +int make_dir(char *path, mode_t mode, uid_t uid, gid_t gid) +{ + struct passwd *pwd; + struct group *grp; + + if (opt_dry_run) { + pwd = getpwuid(uid); + grp = getgrgid(gid); + printf("if [ ! -e %s ]\n", path); + printf("then\n"); + printf(" mkdir %s\n", path); + printf(" chown %s:%s %s\n", pwd->pw_name, grp->gr_name, path); + printf(" chmod %o %s\n", mode, path); + printf("fi\n"); + return 0; + } + + if (mkdir(path, mode)) { + if (errno != EEXIST) { + ERROR("Unable to create dir %s, error: %s\n", + path, strerror(errno)); + return 1; + } + } else { + if (chown(path, uid, gid)) { + ERROR("Unable to change ownership of %s, error: %s\n", + path, strerror(errno)); + return 1; + } + + if (chmod(path, mode)) { + ERROR("Unable to change permission on %s, error: %s\n", + path, strerror(errno)); + return 1; + } + } + + return 0; +} + +/** + * ensure_dir will build the entire directory structure up to and + * including path, all directories built will be owned by + * user:group and permissions will be set to mode. 
+ */ +int ensure_dir(char *path, mode_t mode, uid_t uid, gid_t gid) +{ + char *idx; + + if (!path || strlen(path) == 0) + return 0; + + idx = strchr(path + 1, '/'); + + do { + if (idx) + *idx = '\0'; + + if (make_dir(path, mode, uid, gid)) + return 1; + + if (idx) { + *idx = '/'; + idx++; + } + } while ((idx = strchr(idx, '/')) != NULL); + + if (make_dir(path, mode, uid, gid)) + return 1; + + return 0; +} + +int check_if_already_mounted(struct mount_list *list, char *path) +{ + while (list) { + if (!strcmp(list->entry.mnt_dir, path)) + return 1; + list = list->next; + } + return 0; +} + +int mount_dir(char *path, char *options, mode_t mode, uid_t uid, gid_t gid) +{ + struct passwd *pwd; + struct group *grp; + struct mntent entry; + FILE *mounts; + struct mount_list *list, *previous; + + list = collect_active_mounts(NULL); + + if (list && check_if_already_mounted(list, path)) { + WARNING("Directory %s is already mounted.\n", path); + + while (list) { + previous = list; + list = list->next; + free(previous); + } + return 0; + } + + while (list) { + previous = list; + list = list->next; + free(previous); + } + + if (opt_dry_run) { + pwd = getpwuid(uid); + grp = getgrgid(gid); + printf("mount -t %s none %s -o %s\n", FS_NAME, + path, options); + printf("chown %s:%s %s\n", pwd->pw_name, grp->gr_name, + path); + printf("chmod %o %s\n", mode, path); + } else { + if (mount("none", path, FS_NAME, 0, options)) { + ERROR("Unable to mount %s, error: %s\n", + path, strerror(errno)); + return 1; + } + + mounts = setmntent(MOUNTED, "a+"); + if (mounts) { + entry.mnt_fsname = FS_NAME; + entry.mnt_dir = path; + entry.mnt_type = FS_NAME; + entry.mnt_opts = options; + entry.mnt_freq = 0; + entry.mnt_passno = 0; + if (addmntent(mounts, &entry)) + WARNING("Unable to add entry %s to %s, error: %s\n", + path, MOUNTED, strerror(errno)); + endmntent(mounts); + } else { + WARNING("Unable to open %s, error: %s\n", + MOUNTED, strerror(errno)); + } + + if (chown(path, uid, gid)) { + 
ERROR("Unable to change ownership of %s, error: %s\n", + path, strerror(errno)); + return 1; + } + + if (chmod(path, mode)) { + ERROR("Unable to set permissions on %s, error: %s\n", + path, strerror(errno)); + return 1; + } + } + return 0; +} + +void scale_size(char *buf, unsigned long pagesize) +{ + if(pagesize >= GB) + snprintf(buf, OPT_MAX, "%luGB", pagesize / GB); + else if(pagesize >= MB) + snprintf(buf, OPT_MAX, "%luMB", pagesize / MB); + else + snprintf(buf, OPT_MAX, "%luKB", pagesize / KB); +} + +void create_mounts(char *user, char *group, char *base, mode_t mode) +{ + struct hpage_pool pools[MAX_POOLS]; + char path[PATH_MAX]; + char options[OPT_MAX]; + char limits[OPT_MAX]; + char scaled[OPT_MAX]; + int cnt, pos; + struct passwd *pwd; + struct group *grp; + uid_t uid = 0; + gid_t gid = 0; + + if (geteuid() != 0) { + ERROR("Mounts can only be created by root\n"); + exit(EXIT_FAILURE); + } + + if (user) { + pwd = getpwnam(user); + if (!pwd) { + ERROR("Could not find specified user %s\n", user); + exit(EXIT_FAILURE); + } + uid = pwd->pw_uid; + } else if (group) { + grp = getgrnam(group); + if (!grp) { + ERROR("Could not find specified group %s\n", group); + exit(EXIT_FAILURE); + } + gid = grp->gr_gid; + } + + if (ensure_dir(base, + S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH, 0, 0)) + exit(EXIT_FAILURE); + + cnt = hpool_sizes(pools, MAX_POOLS); + if (cnt < 0) { + ERROR("Unable to obtain pools list\n"); + exit(EXIT_FAILURE); + } + + for (pos=0; cnt--; pos++) { + scaled[0] = 0; + scale_size(scaled, pools[pos].pagesize); + if (user) + snprintf(path, PATH_MAX, "%s/%s/pagesize-%s", + base, user, scaled); + else if (group) + snprintf(path, PATH_MAX, "%s/%s/pagesize-%s", + base, group, scaled); + else + snprintf(path, PATH_MAX, "%s/pagesize-%s", + base, scaled); + + snprintf(options, OPT_MAX, "pagesize=%ld", + pools[pos].pagesize); + + /* Yes, this could be cleverer */ + if (opt_limit_mount_size && opt_limit_mount_inodes) + snprintf(limits, OPT_MAX, 
",size=%lu,nr_inodes=%d", + opt_limit_mount_size, opt_limit_mount_inodes); + else { + if (opt_limit_mount_size) + snprintf(limits, OPT_MAX, ",size=%lu", + opt_limit_mount_size); + if (opt_limit_mount_inodes) + snprintf(limits, OPT_MAX, ",nr_inodes=%d", + opt_limit_mount_inodes); + } + + /* Append limits if specified */ + if (limits[0] != 0) { + size_t maxlen = OPT_MAX - strlen(options); + if (maxlen > strlen(limits)) + strcat(options, limits); + else + WARNING("String limitations met, cannot append limitations onto mount options string. Increase OPT_MAX"); + } + + if (ensure_dir(path, mode, uid, gid)) + exit(EXIT_FAILURE); + + if (mount_dir(path, options, mode, uid, gid)) + exit(EXIT_FAILURE); + } +} + +/** + * show_mem shouldn't change the behavior of any of its + * callers, it only prints a message to the user showing the + * total amount of memory in the system (in megabytes). + */ +void show_mem() +{ + long mem_total; + + mem_total = read_meminfo(MEM_TOTAL); + printf("Total System Memory: %ld MB\n\n", mem_total / 1024); +} + +/** + * check_swap shouldn't change the behavior of any of its + * callers, it only prints a message to the user if something + * is being done that might fail without swap available. i.e. 
+ * resizing a huge page pool + */ +void check_swap() +{ + long swap_sz; + long swap_total; + + swap_total = read_meminfo(SWAP_TOTAL); + if (swap_total <= 0) { + WARNING("There is no swap space configured, resizing hugepage pool may fail\n"); + WARNING("Use --add-temp-swap option to temporarily add swap during the resize\n"); + return; + } + + swap_sz = read_meminfo(SWAP_FREE); + /* meminfo keeps values in kb, but we use bytes for hpage sizes */ + swap_sz *= 1024; + if (swap_sz <= gethugepagesize()) { + WARNING("There is very little swap space free, resizing hugepage pool may fail\n"); + WARNING("Use --add-temp-swap option to temporarily add swap during the resize\n"); + } +} + +#define ZONEINFO_LINEBUF 1024 +long recommended_minfreekbytes(void) +{ + FILE *f; + char buf[ZONEINFO_LINEBUF]; + int nr_zones = 0; + long recommended_min; + long pageblock_kbytes = kernel_default_hugepage_size() / 1024; + + /* Detect the number of zones in the system */ + f = fopen(PROCZONEINFO, "r"); + if (f == NULL) { + WARNING("Unable to open " PROCZONEINFO); + return 0; + } + while (fgets(buf, ZONEINFO_LINEBUF, f) != NULL) { + if (strncmp(buf, "Node ", 5) == 0) + nr_zones++; + } + fclose(f); + + /* Make sure at least 2 pageblocks are free for MIGRATE_RESERVE */ + recommended_min = pageblock_kbytes * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. 
+ */ + recommended_min += pageblock_kbytes * nr_zones * 3 * 3; + return recommended_min; +} + +void set_recommended_minfreekbytes(void) +{ + long recommended_min = recommended_minfreekbytes(); + + if (opt_dry_run) { + printf("echo \"%ld\" > %s\n", recommended_min, + PROCMINFREEKBYTES); + return; + } + + DEBUG("Setting min_free_kbytes to %ld\n", recommended_min); + file_write_ulong(PROCMINFREEKBYTES, (unsigned long)recommended_min); +} + +/* + * check_minfreekbytes does not alter the value of min_free_kbytes. It just + * reports what the current value is and what it should be + */ +void check_minfreekbytes(void) +{ + long min_free_kbytes = file_read_ulong(PROCMINFREEKBYTES, NULL); + long recommended_min = recommended_minfreekbytes(); + + /* There should be at least one pageblock free per zone in the system */ + if (recommended_min > min_free_kbytes) { + printf("\n"); + printf("The " PROCMINFREEKBYTES " of %ld is too small. To maximiuse efficiency\n", min_free_kbytes); + printf("of fragmentation avoidance, there should be at least one huge page free per zone\n"); + printf("in the system which minimally requires a min_free_kbytes value of %ld\n", recommended_min); + } +} + +unsigned long long recommended_shmmax(void) +{ + struct hpage_pool pools[MAX_POOLS]; + unsigned long long recommended_shmmax = 0; + int pos, cnt; + + cnt = hpool_sizes(pools, MAX_POOLS); + if (cnt < 0) { + ERROR("unable to obtain pools list"); + exit(EXIT_FAILURE); + } + + for (pos = 0; cnt--; pos++) + recommended_shmmax += ((unsigned long long)pools[pos].maximum * + pools[pos].pagesize); + + return recommended_shmmax; +} + +void set_recommended_shmmax(void) +{ + int ret; + unsigned long max_recommended = -1UL; + unsigned long long recommended = recommended_shmmax(); + + if (recommended == 0) { + printf("\n"); + WARNING("We can only set a recommended shmmax when huge pages are configured!\n"); + return; + } + + if (recommended > max_recommended) + recommended = max_recommended; + + DEBUG("Setting 
shmmax to %llu\n", recommended); + ret = file_write_ulong(PROCSHMMAX, (unsigned long)recommended); + + if (!ret) { + INFO("To make shmmax settings persistent, add the following line to /etc/sysctl.conf:\n"); + INFO(" kernel.shmmax = %llu\n", recommended); + } +} + +void check_shmmax(void) +{ + long current_shmmax = file_read_ulong(PROCSHMMAX, NULL); + long recommended = recommended_shmmax(); + + if (current_shmmax != recommended) { + printf("\n"); + printf("A " PROCSHMMAX " value of %ld bytes may be sub-optimal. To maximise\n", current_shmmax); + printf("shared memory usage, this should be set to the size of the largest shared memory\n"); + printf("segment size you want to be able to use. Alternatively, set it to a size matching\n"); + printf("the maximum possible allocation size of all huge pages. This can be done\n"); + printf("automatically, using the --set-recommended-shmmax option.\n"); + } + + if (recommended == 0) { + printf("\n"); + WARNING("We can't make a shmmax recommendation until huge pages are configured!\n"); + return; + } + + printf("\n"); + printf("The recommended shmmax for your currently allocated huge pages is %ld bytes.\n", recommended); + printf("To make shmmax settings persistent, add the following line to /etc/sysctl.conf:\n"); + printf(" kernel.shmmax = %ld\n", recommended); +} + +void set_hugetlb_shm_group(gid_t gid, char *group) +{ + int ret; + + DEBUG("Setting hugetlb_shm_group to %d (%s)\n", gid, group); + ret = file_write_ulong(PROCHUGETLBGROUP, (unsigned long)gid); + + if (!ret) { + INFO("To make hugetlb_shm_group settings persistent, add the following line to /etc/sysctl.conf:\n"); + INFO(" vm.hugetlb_shm_group = %d\n", gid); + } +} + +/* heisted from shadow-utils/libmisc/list.c::is_on_list() */ +static int user_in_group(char *const *list, const char *member) +{ + while (*list != NULL) { + if (strcmp(*list, member) == 0) { + return 1; + } + list++; + } + + return 0; +} + +void check_user(void) +{ + uid_t uid; + gid_t gid; + struct 
passwd *pwd; + struct group *grp; + + gid = (gid_t)file_read_ulong(PROCHUGETLBGROUP, NULL); + grp = getgrgid(gid); + if (!grp) { + printf("\n"); + WARNING("Group ID %d in hugetlb_shm_group doesn't appear to be a valid group!\n", gid); + return; + } + + uid = getuid(); + pwd = getpwuid(uid); + + /* Don't segfault if user does not have a passwd entry. */ + if (!pwd) { + printf("\n"); + WARNING("User uid %d is not in the password file!\n", uid); + return; + } + + if (gid != pwd->pw_gid && !user_in_group(grp->gr_mem, pwd->pw_name) && uid != 0) { + printf("\n"); + WARNING("User %s (uid: %d) is not a member of the hugetlb_shm_group %s (gid: %d)!\n", pwd->pw_name, uid, grp->gr_name, gid); + } else { + printf("\n"); + printf("To make your hugetlb_shm_group settings persistent, add the following line to /etc/sysctl.conf:\n"); + printf(" vm.hugetlb_shm_group = %d\n", gid); + } +} + +void add_temp_swap(long page_size) +{ + char path[PATH_MAX]; + char file[PATH_MAX]; + char mkswap_cmd[PATH_MAX]; + FILE *f; + char *buf; + long swap_size; + long pid; + int ret; + int num_pages; + + if (geteuid() != 0) { + ERROR("Swap can only be manipulated by root\n"); + exit(EXIT_FAILURE); + } + + pid = getpid(); + snprintf(path, PATH_MAX, "%s/swap/temp", MOUNT_DIR); + snprintf(file, PATH_MAX, "%s/swapfile-%ld", path, pid); + + /* swapsize is 5 hugepages */ + if (opt_temp_swap == -1) + num_pages = 5; + else + num_pages = opt_temp_swap; + swap_size = num_pages * page_size; + + if (ensure_dir(path, S_IRWXU | S_IRGRP | S_IXGRP, 0, 0)) + exit(EXIT_FAILURE); + + if (opt_dry_run) { + printf("dd bs=1024 count=%ld if=/dev/zero of=%s\n", + swap_size / 1024, file); + printf("mkswap %s\nswapon %s\n", file, file); + return; + } + + f = fopen(file, "wx"); + if (!f) { + WARNING("Couldn't open %s: %s\n", file, strerror(errno)); + opt_temp_swap = 0; + return; + } + + buf = malloc(swap_size); + memset(buf, 0, swap_size); + fwrite(buf, sizeof(char), swap_size, f); + free(buf); + fclose(f); + + 
snprintf(mkswap_cmd, PATH_MAX, "mkswap %s", file); + ret = system(mkswap_cmd); + if (WIFSIGNALED(ret)) { + WARNING("Call to mkswap failed\n"); + opt_temp_swap = 0; + return; + } else if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + WARNING("Call to mkswap failed\n"); + opt_temp_swap = 0; + return; + } + } + + DEBUG("swapon %s\n", file); + if (swapon(file, 0)) { + WARNING("swapon on %s failed: %s\n", file, strerror(errno)); + opt_temp_swap = 0; + } +} + +void rem_temp_swap() { + char file[PATH_MAX]; + long pid; + + pid = getpid(); + snprintf(file, PATH_MAX, "%s/swap/temp/swapfile-%ld", MOUNT_DIR, pid); + + if (opt_dry_run) { + printf("swapoff %s\nrm -f %s\n", file, file); + return; + } + + if (swapoff(file)) + WARNING("swapoff on %s failed: %s\n", file, strerror(errno)); + remove(file); + DEBUG("swapoff %s\n", file); +} + +void add_ramdisk_swap(long page_size) { + char ramdisk[PATH_MAX]; + char mkswap_cmd[PATH_MAX]; + int disk_num=0; + int count = 0; + long ramdisk_size; + int ret; + int fd; + + snprintf(ramdisk, PATH_MAX, "/dev/ram%i", disk_num); + fd = open(ramdisk, O_RDONLY); + ioctl(fd, BLKGETSIZE, &ramdisk_size); + close(fd); + + ramdisk_size = ramdisk_size * 512; + count = (page_size/ramdisk_size) + 1; + + if (count > 1) { + INFO("Swap will be initialized on multiple ramdisks because\n\ + ramdisk size is less than huge page size. 
To avoid\n\ + this in the future, use kernel command line parameter\n\ + ramdisk_size=N, to set ramdisk size to N blocks.\n"); + } + + while (count > 0) { + snprintf(ramdisk, PATH_MAX, "/dev/ram%i", disk_num); + if (access(ramdisk, F_OK) != 0){ + break; + } + disk_num++; + + if (opt_dry_run) { + printf("mkswap %s\nswapon %s\n", ramdisk, ramdisk); + } else { + snprintf(mkswap_cmd, PATH_MAX, "mkswap %s", ramdisk); + ret = system(mkswap_cmd); + if (WIFSIGNALED(ret)) { + WARNING("Call to mkswap failed\n"); + continue; + } else if (WIFEXITED(ret)) { + ret = WEXITSTATUS(ret); + if (ret) { + WARNING("Call to mkswap failed\n"); + continue; + } + } + DEBUG("swapon %s\n", ramdisk); + if (swapon(ramdisk, 0)) { + WARNING("swapon on %s failed: %s\n", ramdisk, strerror(errno)); + opt_temp_swap = 0; + continue; + } + } + count--; + strcat(ramdisk_list, " "); + strcat(ramdisk_list, ramdisk); + } +} + +void rem_ramdisk_swap(){ + char *ramdisk; + char *iter = NULL; + + ramdisk = strtok_r(ramdisk_list, " ", &iter); + while (ramdisk != NULL) { + if (opt_dry_run) { + printf("swapoff %s\n", ramdisk); + } else { + DEBUG("swapoff %s\n", ramdisk); + if (swapoff(ramdisk)) { + WARNING("swapoff on %s failed: %s\n", ramdisk, strerror(errno)); + continue; + } + } + ramdisk = strtok_r(NULL, " ", &iter); + } +} + +void set_trans_opt(const char *file, const char *value) +{ + FILE *f; + + if (geteuid() != 0) { + ERROR("Transparent huge page options can only be set by root\n"); + exit(EXIT_FAILURE); + } + + if (opt_dry_run) { + printf("echo '%s' > %s\n", value, file); + return; + } + + f = fopen(file, "w"); + if (!f) { + ERROR("Couldn't open %s: %s\n", file, strerror(errno)); + return; + } + + fprintf(f, "%s", value); + fclose(f); +} + +enum { + POOL_MIN, + POOL_MAX, + POOL_BOTH, +}; + +static long value_adjust(char *adjust_str, long base, long page_size) +{ + long long adjust; + char *iter; + + /* Convert and validate the adjust. 
*/ + errno = 0; + adjust = strtol(adjust_str, &iter, 0); + /* Catch strtol errors and sizes that overflow the native word size */ + if (errno || adjust_str == iter) { + if (errno == ERANGE) + errno = EOVERFLOW; + else + errno = EINVAL; + ERROR("%s: invalid adjustment\n", adjust_str); + exit(EXIT_FAILURE); + } + + switch (*iter) { + case 'G': + case 'g': + adjust = size_to_smaller_unit(adjust); + case 'M': + case 'm': + adjust = size_to_smaller_unit(adjust); + case 'K': + case 'k': + adjust = size_to_smaller_unit(adjust); + adjust = adjust / page_size; + } + + if (adjust_str[0] != '+' && adjust_str[0] != '-') + base = 0; + + /* Ensure we neither go negative nor exceed LONG_MAX. */ + if (adjust < 0 && -adjust > base) { + adjust = -base; + } + if (adjust > 0 && (base + adjust) < base) { + adjust = LONG_MAX - base; + } + base += adjust; + + DEBUG("Returning page count of %ld\n", base); + + return base; +} + + +void pool_adjust(char *cmd, unsigned int counter) +{ + struct hpage_pool pools[MAX_POOLS]; + int pos; + int cnt; + + char *iter = NULL; + char *page_size_str = NULL; + char *adjust_str = NULL; + long page_size; + + unsigned long min; + unsigned long min_orig; + unsigned long max; + unsigned long last_pool_value; + + /* Extract the pagesize and adjustment. */ + page_size_str = strtok_r(cmd, ":", &iter); + if (page_size_str) + adjust_str = strtok_r(NULL, ":", &iter); + + if (!page_size_str || !adjust_str) { + ERROR("%s: invalid resize specification\n", cmd); + exit(EXIT_FAILURE); + } + INFO("page_size<%s> adjust<%s> counter<%d>\n", + page_size_str, adjust_str, counter); + + /* Convert and validate the page_size. 
*/ + if (strcmp(page_size_str, "DEFAULT") == 0) + page_size = kernel_default_hugepage_size(); + else + page_size = parse_page_size(page_size_str); + + DEBUG("Working with page_size of %ld\n", page_size); + + cnt = hpool_sizes(pools, MAX_POOLS); + if (cnt < 0) { + ERROR("unable to obtain pools list"); + exit(EXIT_FAILURE); + } + for (pos = 0; cnt--; pos++) { + if (pools[pos].pagesize == page_size) + break; + } + if (cnt < 0) { + ERROR("%s: unknown page size\n", page_size_str); + exit(EXIT_FAILURE); + } + + min_orig = min = pools[pos].minimum; + max = pools[pos].maximum; + + if (counter == POOL_BOTH) { + min = value_adjust(adjust_str, min, page_size); + max = min; + } else if (counter == POOL_MIN) { + min = value_adjust(adjust_str, min, page_size); + if (min > max) + max = min; + } else { + max = value_adjust(adjust_str, max, page_size); + if (max < min) + min = max; + } + + INFO("%ld, %ld -> %ld, %ld\n", pools[pos].minimum, pools[pos].maximum, + min, max); + + if ((pools[pos].maximum - pools[pos].minimum) < (max - min)) { + INFO("setting HUGEPAGES_OC to %ld\n", (max - min)); + set_huge_page_counter(page_size, HUGEPAGES_OC, (max - min)); + } + + if (opt_hard) + cnt = 5; + else + cnt = -1; + + if (min > min_orig) { + if (opt_temp_swap) + add_temp_swap(page_size); + if (opt_ramdisk_swap) + add_ramdisk_swap(page_size); + check_swap(); + } + + if (opt_obey_mempolicy && get_huge_page_counter(page_size, + HUGEPAGES_TOTAL_MEMPOL) < 0) { + opt_obey_mempolicy = 0; + WARNING("Counter for NUMA huge page allocations is not found, continuing with normal pool adjustment\n"); + } + + INFO("setting HUGEPAGES_TOTAL%s to %ld\n", + opt_obey_mempolicy ? "_MEMPOL" : "", min); + set_huge_page_counter(page_size, + opt_obey_mempolicy ? 
HUGEPAGES_TOTAL_MEMPOL : HUGEPAGES_TOTAL, + min); + get_pool_size(page_size, &pools[pos]); + + /* If we fail to make an allocation, retry if user requests */ + last_pool_value = pools[pos].minimum; + while ((pools[pos].minimum != min) && (cnt > 0)) { + /* Make note if progress is being made and sleep for IO */ + if (last_pool_value == pools[pos].minimum) + cnt--; + else + cnt = 5; + sleep(6); + + last_pool_value = pools[pos].minimum; + INFO("Retrying allocation HUGEPAGES_TOTAL%s to %ld current %ld\n", opt_obey_mempolicy ? "_MEMPOL" : "", min, pools[pos].minimum); + set_huge_page_counter(page_size, + opt_obey_mempolicy ? + HUGEPAGES_TOTAL_MEMPOL : + HUGEPAGES_TOTAL, + min); + get_pool_size(page_size, &pools[pos]); + } + + if (min > min_orig && !opt_swap_persist) { + if (opt_temp_swap) + rem_temp_swap(); + else if (opt_ramdisk_swap) + rem_ramdisk_swap(); + } + + /* + * HUGEPAGES_TOTAL is not guarenteed to check to exactly the figure + * requested should there be insufficient pages. Check the new + * value and adjust HUGEPAGES_OC accordingly. 
+ */ + if (pools[pos].minimum != min) { + WARNING("failed to set pool minimum to %ld became %ld\n", + min, pools[pos].minimum); + min = pools[pos].minimum; + } + if (pools[pos].maximum != max) { + INFO("setting HUGEPAGES_OC to %ld\n", (max - min)); + set_huge_page_counter(page_size, HUGEPAGES_OC, (max - min)); + } +} + +void page_sizes(int all) +{ + struct hpage_pool pools[MAX_POOLS]; + int pos; + int cnt; + + cnt = hpool_sizes(pools, MAX_POOLS); + if (cnt < 0) { + ERROR("unable to obtain pools list"); + exit(EXIT_FAILURE); + } + qsort(pools, cnt, sizeof(pools[0]), cmpsizes); + + for (pos = 0; cnt--; pos++) { + if (all || (pools[pos].maximum && + hugetlbfs_find_path_for_size(pools[pos].pagesize))) + printf("%ld\n", pools[pos].pagesize); + } +} + +void explain() +{ + show_mem(); + mounts_list_all(); + printf("\nHuge page pools:\n"); + pool_list(); + printf("\nHuge page sizes with configured pools:\n"); + page_sizes(0); + check_minfreekbytes(); + check_shmmax(); + check_swap(); + check_user(); + printf("\nNote: Permanent swap space should be preferred when dynamic " + "huge page pools are used.\n"); +} + +int main(int argc, char** argv) +{ + int ops; + int has_hugepages = kernel_has_hugepages(); + + char opts[] = "+hdv"; + char base[PATH_MAX]; + char *opt_min_adj[MAX_POOLS], *opt_max_adj[MAX_POOLS]; + char *opt_user_mounts = NULL, *opt_group_mounts = NULL; + int opt_list_mounts = 0, opt_pool_list = 0, opt_create_mounts = 0; + int opt_global_mounts = 0, opt_pgsizes = 0, opt_pgsizes_all = 0; + int opt_explain = 0, minadj_count = 0, maxadj_count = 0; + int opt_trans_always = 0, opt_trans_never = 0, opt_trans_madvise = 0; + int opt_khuge_pages = 0, opt_khuge_scan = 0, opt_khuge_alloc = 0; + int ret = 0, index = 0; + char *khuge_pages = NULL, *khuge_alloc = NULL, *khuge_scan = NULL; + gid_t opt_gid = 0; + struct group *opt_grp = NULL; + int group_invalid = 0; + struct option long_opts[] = { + {"help", no_argument, NULL, 'h'}, + {"verbose", required_argument, NULL, 'v' }, 
+ + {"list-all-mounts", no_argument, NULL, LONG_LIST_ALL_MOUNTS}, + {"pool-list", no_argument, NULL, LONG_POOL_LIST}, + {"pool-pages-min", required_argument, NULL, LONG_POOL_MIN_ADJ}, + {"pool-pages-max", required_argument, NULL, LONG_POOL_MAX_ADJ}, + {"obey-mempolicy", no_argument, NULL, LONG_POOL_MEMPOL}, + {"thp-always", no_argument, NULL, LONG_TRANS_ALWAYS}, + {"thp-madvise", no_argument, NULL, LONG_TRANS_MADVISE}, + {"thp-never", no_argument, NULL, LONG_TRANS_NEVER}, + {"thp-khugepaged-pages", required_argument, NULL, LONG_KHUGE_PAGES}, + {"thp-khugepaged-scan-sleep", required_argument, NULL, LONG_KHUGE_SCAN}, + {"thp-khugepaged-alloc-sleep", required_argument, NULL, LONG_KHUGE_ALLOC}, + {"set-recommended-min_free_kbytes", no_argument, NULL, LONG_SET_RECOMMENDED_MINFREEKBYTES}, + {"set-recommended-shmmax", no_argument, NULL, LONG_SET_RECOMMENDED_SHMMAX}, + {"set-shm-group", required_argument, NULL, LONG_SET_HUGETLB_SHM_GROUP}, + {"enable-zone-movable", no_argument, NULL, LONG_MOVABLE_ENABLE}, + {"disable-zone-movable", no_argument, NULL, LONG_MOVABLE_DISABLE}, + {"hard", no_argument, NULL, LONG_HARD}, + {"add-temp-swap", optional_argument, NULL, LONG_SWAP_DISK}, + {"add-ramdisk-swap", no_argument, NULL, LONG_SWAP_RAMDISK}, + {"persist", no_argument, NULL, LONG_SWAP_PERSIST}, + {"create-mounts", no_argument, NULL, LONG_CREATE_MOUNTS}, + {"create-user-mounts", required_argument, NULL, LONG_CREATE_USER_MOUNTS}, + {"create-group-mounts", required_argument, NULL, LONG_CREATE_GROUP_MOUNTS}, + {"create-global-mounts", no_argument, NULL, LONG_CREATE_GLOBAL_MOUNTS}, + + {"max-size", required_argument, NULL, LONG_LIMIT_SIZE}, + {"max-inodes", required_argument, NULL, LONG_LIMIT_INODES}, + + {"page-sizes", no_argument, NULL, LONG_PAGE_SIZES}, + {"page-sizes-all", no_argument, NULL, LONG_PAGE_AVAIL}, + {"dry-run", no_argument, NULL, 'd'}, + {"explain", no_argument, NULL, LONG_EXPLAIN}, + + {0}, + }; + + hugetlbfs_setup_debug(); + setup_mounts(); + verbose_init(); + + ops 
= 0; + while (ret != -1) { + ret = getopt_long(argc, argv, opts, long_opts, &index); + switch (ret) { + case -1: + break; + + case '?': + print_usage(); + exit(EXIT_FAILURE); + + case 'h': + print_usage(); + exit(EXIT_SUCCESS); + + case 'v': + verbose(optarg); + continue; + + case 'd': + opt_dry_run = 1; + continue; + + default: + /* All other commands require hugepage support. */ + if (! has_hugepages) { + ERROR("kernel does not support huge pages\n"); + exit(EXIT_FAILURE); + } + } + switch (ret) { + case -1: + break; + + case LONG_HARD: + opt_hard = 1; + continue; + + case LONG_SWAP_DISK: + if (optarg) + opt_temp_swap = atoi(optarg); + else + opt_temp_swap = -1; + break; + + case LONG_SWAP_RAMDISK: + opt_ramdisk_swap = 1; + break; + + case LONG_SWAP_PERSIST: + opt_swap_persist = 1; + + case LONG_LIST_ALL_MOUNTS: + opt_list_mounts = 1; + break; + + case LONG_POOL_LIST: + opt_pool_list = 1; + break; + + case LONG_POOL_MIN_ADJ: + if (minadj_count == MAX_POOLS) { + WARNING("Attempting to adjust an invalid " + "pool or a pool multiple times, " + "ignoring request: '%s'\n", optarg); + } else { + opt_min_adj[minadj_count++] = optarg; + } + break; + + case LONG_POOL_MEMPOL: + opt_obey_mempolicy = 1; + break; + + case LONG_TRANS_ALWAYS: + opt_trans_always = 1; + break; + + case LONG_TRANS_MADVISE: + opt_trans_madvise = 1; + break; + + case LONG_TRANS_NEVER: + opt_trans_never = 1; + break; + + case LONG_KHUGE_PAGES: + opt_khuge_pages = 1; + khuge_pages = optarg; + break; + + case LONG_KHUGE_SCAN: + opt_khuge_scan = 1; + khuge_scan = optarg; + break; + + case LONG_KHUGE_ALLOC: + opt_khuge_alloc = 1; + khuge_alloc = optarg; + break; + + case LONG_POOL_MAX_ADJ: + if (! 
kernel_has_overcommit()) { + ERROR("kernel does not support overcommit, " + "max cannot be adjusted\n"); + exit(EXIT_FAILURE); + } + + if (maxadj_count == MAX_POOLS) { + WARNING("Attempting to adjust an invalid " + "pool or a pool multiple times, " + "ignoring request: '%s'\n", optarg); + } else { + opt_max_adj[maxadj_count++] = optarg; + } + break; + + case LONG_MOVABLE_ENABLE: + opt_movable = 1; + break; + + case LONG_SET_RECOMMENDED_MINFREEKBYTES: + opt_set_recommended_minfreekbytes = 1; + break; + + case LONG_SET_RECOMMENDED_SHMMAX: + opt_set_recommended_shmmax = 1; + break; + + case LONG_SET_HUGETLB_SHM_GROUP: + opt_grp = getgrnam(optarg); + if (!opt_grp) { + opt_gid = atoi(optarg); + if (opt_gid == 0 && strcmp(optarg, "0")) + group_invalid = 1; + opt_grp = getgrgid(opt_gid); + if (!opt_grp) + group_invalid = 1; + } else { + opt_gid = opt_grp->gr_gid; + } + if (group_invalid) { + ERROR("Invalid group specification (%s)\n", optarg); + exit(EXIT_FAILURE); + } + opt_set_hugetlb_shm_group = 1; + break; + + case LONG_MOVABLE_DISABLE: + opt_movable = 0; + break; + + case LONG_CREATE_MOUNTS: + opt_create_mounts = 1; + break; + + case LONG_CREATE_USER_MOUNTS: + opt_user_mounts = optarg; + break; + + case LONG_CREATE_GROUP_MOUNTS: + opt_group_mounts = optarg; + break; + + case LONG_CREATE_GLOBAL_MOUNTS: + opt_global_mounts = 1; + break; + + case LONG_LIMIT_SIZE: + /* Not a pagesize, but the conversions the same */ + opt_limit_mount_size = parse_page_size(optarg); + if (!opt_limit_mount_size) + WARNING("Mount max size specification 0, invalid or overflowed\n"); + break; + + case LONG_LIMIT_INODES: + opt_limit_mount_inodes = atoi(optarg); + break; + + case LONG_PAGE_SIZES: + opt_pgsizes = 1; + break; + + case LONG_PAGE_AVAIL: + opt_pgsizes_all = 1; + break; + + case LONG_EXPLAIN: + opt_explain = 1; + break; + + default: + WARNING("unparsed option %08x\n", ret); + ret = -1; + break; + } + if (ret != -1) + ops++; + } + + verbose_expose(); + + if (opt_list_mounts) + 
mounts_list_all(); + + if (opt_pool_list) + pool_list(); + + if (opt_movable != -1) + setup_zone_movable(opt_movable); + + if (opt_trans_always) + set_trans_opt(TRANS_ENABLE, ALWAYS); + + if (opt_trans_madvise) + set_trans_opt(TRANS_ENABLE, MADVISE); + + if (opt_trans_never) + set_trans_opt(TRANS_ENABLE, NEVER); + + if (opt_khuge_pages) + set_trans_opt(KHUGE_SCAN_PAGES, khuge_pages); + + if (opt_khuge_alloc) + set_trans_opt(KHUGE_ALLOC_SLEEP, khuge_alloc); + + if (opt_khuge_scan) + set_trans_opt(KHUGE_SCAN_SLEEP, khuge_scan); + + if (opt_set_recommended_minfreekbytes) + set_recommended_minfreekbytes(); + + if (opt_set_recommended_shmmax) + set_recommended_shmmax(); + + if (opt_set_hugetlb_shm_group) + set_hugetlb_shm_group(opt_gid, opt_grp->gr_name); + + while (--minadj_count >= 0) { + if (! kernel_has_overcommit()) + pool_adjust(opt_min_adj[minadj_count], POOL_BOTH); + else + pool_adjust(opt_min_adj[minadj_count], POOL_MIN); + } + + while (--maxadj_count >=0) + pool_adjust(opt_max_adj[maxadj_count], POOL_MAX); + + if (opt_create_mounts) { + snprintf(base, PATH_MAX, "%s", MOUNT_DIR); + create_mounts(NULL, NULL, base, S_IRWXU | S_IRWXG); + } + + + if (opt_user_mounts != NULL) { + snprintf(base, PATH_MAX, "%s/user", MOUNT_DIR); + create_mounts(opt_user_mounts, NULL, base, S_IRWXU); + } + + if (opt_group_mounts) { + snprintf(base, PATH_MAX, "%s/group", MOUNT_DIR); + create_mounts(NULL, opt_group_mounts, base, S_IRWXG); + } + + if (opt_global_mounts) { + snprintf(base, PATH_MAX, "%s/global", MOUNT_DIR); + create_mounts(NULL, NULL, base, S_IRWXU | S_IRWXG | S_IRWXO); + } + + if (opt_pgsizes) + page_sizes(0); + + if (opt_pgsizes_all) + page_sizes(1); + + if (opt_explain) + explain(); + + index = optind; + + if ((argc - index) != 0 || ops == 0) { + print_usage(); + exit(EXIT_FAILURE); + } + + exit(EXIT_SUCCESS); +} diff --git a/default/libhugetlbfs/libhugetlbfs/hugectl.c b/default/libhugetlbfs/libhugetlbfs/hugectl.c new file mode 100644 index 0000000..741247f --- 
/dev/null +++ b/default/libhugetlbfs/libhugetlbfs/hugectl.c @@ -0,0 +1,488 @@ +/*************************************************************************** + * User front end for using huge pages Copyright (C) 2008, IBM * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the Lesser GNU General Public License as * + * published by the Free Software Foundation; either version 2.1 of the * + * License, or at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the Lesser GNU General Public * + * License along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +/* + * hugectl is inspired by numactl as a single front end to a large number of + * options for controlling a very specific environment. Eventually it will + * have support for controlling the all of the environment variables for + * libhugetlbfs, but options will only be added after they have been in the + * library for some time and are throughly tested and stable. + * + * This program should be treated as an ABI for using libhugetlbfs. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <limits.h> + +#define _GNU_SOURCE /* for getopt_long */ +#include <unistd.h> +#include <getopt.h> + +#define REPORT(level, prefix, format, ...) 
\ + do { \ + if (verbose_level >= level) \ + fprintf(stderr, "hugectl: " prefix ": " format, \ + ##__VA_ARGS__); \ + } while (0); + +#include "libhugetlbfs_debug.h" + +extern int errno; +extern int optind; +extern char *optarg; + +#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text) +#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text) + +void print_usage() +{ + fprintf(stderr, "hugectl [options] target\n"); + fprintf(stderr, "options:\n"); + + OPTION("--help, -h", "Prints this message"); + OPTION("--verbose <level>, -v", "Increases/sets tracing levels"); + + OPTION("--text[=<size>]", "Requests remapping of the program text"); + OPTION("--data[=<size>]", "Requests remapping of the program data"); + OPTION("--bss[=<size>]", "Requests remapping of the program bss"); + OPTION("--heap[=<size>]", "Requests remapping of the program heap"); + CONT("(malloc space)"); + OPTION("--shm", "Requests remapping of shared memory segments"); + OPTION("--thp", "Setup the heap space to be aligned for merging"); + CONT("by khugepaged into huge pages. 
This requires"); + CONT("kernel support for transparent huge pages to be"); + CONT("enabled"); + + OPTION("--no-preload", "Disable preloading the libhugetlbfs library"); + OPTION("--no-reserve", "Disable huge page reservation for segments"); + OPTION("--force-preload", "Force preloading the libhugetlbfs library"); + + OPTION("--dry-run", "describe what would be done without doing it"); + + OPTION("--library-use-path", "Use the system library path"); + OPTION("--share-text", "Share text segments between multiple"); + CONT("application instances"); + OPTION("--library-path <path>", "Select a library prefix"); + CONT("(Default: " +#ifdef LIBDIR32 + LIBDIR32 ":" +#endif +#ifdef LIBDIR32 + LIBDIR32 ":" +#endif + ")"); +} + +int opt_dry_run = 0; +int opt_force_preload = 0; +int verbose_level = VERBOSITY_DEFAULT; + +void verbose_init(void) +{ + char *env; + + env = getenv("HUGETLB_VERBOSE"); + if (env) + verbose_level = atoi(env); + env = getenv("HUGETLB_DEBUG"); + if (env) + verbose_level = VERBOSITY_MAX; +} + +void verbose(char *which) +{ + int new_level; + + if (which) { + new_level = atoi(which); + if (new_level < 0 || new_level > 99) { + ERROR("%d: verbosity out of range 0-99\n", + new_level); + exit(EXIT_FAILURE); + } + } else { + new_level = verbose_level + 1; + if (new_level == 100) { + WARNING("verbosity limited to 99\n"); + new_level--; + } + } + verbose_level = new_level; +} + +void quiet(void) +{ + int new_level = verbose_level - 1; + if (new_level < 0) { + WARNING("verbosity must be at least 0\n"); + new_level = 0; + } + verbose_level = new_level; +} + +void setup_environment(char *var, char *val) +{ + setenv(var, val, 1); + INFO("%s='%s'\n", var, val); + + if (opt_dry_run) + printf("%s='%s'\n", var, val); +} + +void verbose_expose(void) +{ + char level[3]; + + if (verbose_level == 99) { + setup_environment("HUGETLB_DEBUG", "yes"); + } + snprintf(level, sizeof(level), "%d", verbose_level); + setup_environment("HUGETLB_VERBOSE", level); +} + +/* + * getopts 
return values for options which are long only. + */ +#define MAP_BASE 0x1000 +#define LONG_BASE 0x2000 + +#define LONG_NO_PRELOAD (LONG_BASE | 'p') +#define LONG_NO_RESERVE (LONG_BASE | 'r') +#define LONG_FORCE_PRELOAD (LONG_BASE | 'F') + +#define LONG_DRY_RUN (LONG_BASE | 'd') + +#define LONG_SHARE (LONG_BASE | 's') +#define LONG_NO_LIBRARY (LONG_BASE | 'L') +#define LONG_LIBRARY (LONG_BASE | 'l') + +#define LONG_THP_HEAP ('t') + +/* + * Mapping selectors, one per remappable/backable area as requested + * by the user. These are also used as returns from getopts where they + * are offset from MAP_BASE, which must be removed before they are compared. + */ +enum { + MAP_TEXT, + MAP_DATA, + MAP_BSS, + MAP_HEAP, + MAP_SHM, + MAP_DISABLE, + + MAP_COUNT, +}; +char *map_size[MAP_COUNT]; + +char default_size[] = "the default hugepage size"; +#define DEFAULT_SIZE default_size + +#define available(buf, ptr) ((int)(sizeof(buf) - (ptr - buf))) +void setup_mappings(int count) +{ + char value[128]; + char *ptr = value; + int needed; + + /* + * HUGETLB_ELFMAP should be set to either a combination of 'R' and 'W' + * which indicate which segments should be remapped. Each may take + * an optional page size. It may also be set to 'no' to prevent + * remapping. + */ + + /* + * Accumulate sections each with a ':' prefix to simplify later + * handling. We will elide the initial ':' before use. 
+ */ + if (map_size[MAP_TEXT]) { + if (map_size[MAP_TEXT] == DEFAULT_SIZE) + needed = snprintf(ptr, available(value, ptr), ":R"); + else + needed = snprintf(ptr, available(value, ptr), + ":R=%s", map_size[MAP_TEXT]); + ptr += needed; + if (needed < 0 || available(value, ptr) < 0) { + ERROR("%s: bad size specification\n", map_size[MAP_TEXT]); + exit(EXIT_FAILURE); + } + } + if (map_size[MAP_DATA] != 0 || map_size[MAP_BSS] != 0) { + char *size = map_size[MAP_BSS]; + if (map_size[MAP_DATA]) + size = map_size[MAP_DATA]; + if (map_size[MAP_DATA] != map_size[MAP_BSS]) + WARNING("data and bss remapped together in %s\n", size); + + if (size == DEFAULT_SIZE) + needed = snprintf(ptr, available(value, ptr), ":W"); + else + needed = snprintf(ptr, available(value, ptr), + ":W=%s", size); + ptr += needed; + if (needed < 0 || available(value, ptr) < 0) { + ERROR("%s: bad size specification\n", size); + exit(EXIT_FAILURE); + } + } + *ptr = '\0'; + if (ptr != value) + setup_environment("HUGETLB_ELFMAP", &value[1]); + + if (map_size[MAP_DISABLE]) { + if (ptr != value) + WARNING("--disable masks requested remap\n"); + setup_environment("HUGETLB_ELFMAP", "no"); + } + + if (map_size[MAP_HEAP] == DEFAULT_SIZE) + setup_environment("HUGETLB_MORECORE", "yes"); + else if (map_size[MAP_HEAP]) + setup_environment("HUGETLB_MORECORE", map_size[MAP_HEAP]); + + if (map_size[MAP_SHM] && map_size[MAP_SHM] != DEFAULT_SIZE) + WARNING("shm segments may only be mapped in the " + "default hugepage size\n"); + if (map_size[MAP_SHM]) + setup_environment("HUGETLB_SHM", "yes"); +} + +#define LIBRARY_DISABLE ((void *)-1) + +void library_path(char *path) +{ + char val[NAME_MAX] = ""; + char *env; + + env = getenv("LD_LIBRARY_PATH"); + + /* + * Select which libraries we wish to use. If the path is NULL + * use the libraries included with hugectl. If the path is valid + * and points to a directory including a libhugetlbfs.so use it + * directly. 
Else path is assumed to be a prefix to the 32/64 bit + * directories both of which are added, where available. + */ + if (path) { + snprintf(val, sizeof(val), "%s/libhugetlbfs.so", path); + if (access(val, F_OK) == 0) { + /* $PATH */ + snprintf(val, sizeof(val), "%s:%s", + path, env ? env : ""); + + } else { + /* [$PATH/LIB32:][$PATH/LIB64:]$LD_LIBRARY_PATH */ + snprintf(val, sizeof(val), "" +#ifdef LIBDIR32 + "%s/" LIB32 ":" +#endif +#ifdef LIBDIR64 + "%s/" LIB64 ":" +#endif + "%s", +#ifdef LIBDIR32 + path, +#endif +#ifdef LIBDIR64 + path, +#endif + env ? env : ""); + } + + } else { + /* [LIBDIR32:][LIBDIR64:]$LD_LIBRARY_PATH */ + snprintf(val, sizeof(val), "" +#ifdef LIBDIR32 + LIBDIR32 ":" +#endif +#ifdef LIBDIR64 + LIBDIR64 ":" +#endif + "%s", env ? env : ""); + } + setup_environment("LD_LIBRARY_PATH", val); +} + +void ldpreload(int count) +{ + int allowed = 0; + + if (map_size[MAP_HEAP]) + allowed++; + if (map_size[MAP_SHM]) + allowed++; + + if ((allowed == count) || opt_force_preload) { + setup_environment("LD_PRELOAD", "libhugetlbfs.so"); + if (allowed == count) + INFO("LD_PRELOAD in use for lone --heap/--shm\n"); + } else { + WARNING("LD_PRELOAD not appropriate for this map combination\n"); + } +} + +int main(int argc, char** argv) +{ + int opt_mappings = 0; + int opt_preload = 1; + int opt_no_reserve = 0; + int opt_share = 0; + int opt_thp_heap = 0; + char *opt_library = NULL; + + char opts[] = "+hvq"; + int ret = 0, index = 0; + struct option long_opts[] = { + {"help", no_argument, NULL, 'h'}, + {"verbose", required_argument, NULL, 'v' }, + {"no-preload", no_argument, NULL, LONG_NO_PRELOAD}, + {"no-reserve", no_argument, NULL, LONG_NO_RESERVE}, + {"force-preload", + no_argument, NULL, LONG_FORCE_PRELOAD}, + {"dry-run", no_argument, NULL, LONG_DRY_RUN}, + {"library-path", + required_argument, NULL, LONG_LIBRARY}, + {"library-use-path", + no_argument, NULL, LONG_NO_LIBRARY}, + {"share-text", no_argument, NULL, LONG_SHARE}, + + {"disable", optional_argument, 
NULL, MAP_BASE|MAP_DISABLE}, + {"text", optional_argument, NULL, MAP_BASE|MAP_TEXT}, + {"data", optional_argument, NULL, MAP_BASE|MAP_DATA}, + {"bss", optional_argument, NULL, MAP_BASE|MAP_BSS}, + {"heap", optional_argument, NULL, MAP_BASE|MAP_HEAP}, + {"shm", optional_argument, NULL, MAP_BASE|MAP_SHM}, + {"thp", no_argument, NULL, LONG_THP_HEAP}, + {0}, + }; + + verbose_init(); + + while (ret != -1) { + ret = getopt_long(argc, argv, opts, long_opts, &index); + if (ret > 0 && (ret & MAP_BASE)) { + if (optarg) + map_size[ret & ~MAP_BASE] = optarg; + else + map_size[ret & ~MAP_BASE] = DEFAULT_SIZE; + opt_mappings++; + continue; + } + switch (ret) { + case '?': + print_usage(); + exit(EXIT_FAILURE); + + case 'h': + print_usage(); + exit(EXIT_SUCCESS); + + case 'v': + verbose(optarg); + break; + + case 'q': + quiet(); + break; + + case LONG_THP_HEAP: + opt_thp_heap = 1; + INFO("Aligning heap for use with THP\n"); + break; + + case LONG_NO_PRELOAD: + opt_preload = 0; + INFO("LD_PRELOAD disabled\n"); + break; + + case LONG_NO_RESERVE: + opt_no_reserve = 1; + INFO("MAP_NORESERVE used for huge page mappings\n"); + break; + + case LONG_FORCE_PRELOAD: + opt_preload = 1; + opt_force_preload = 1; + INFO("Forcing ld preload\n"); + break; + + case LONG_DRY_RUN: + opt_dry_run = 1; + break; + + case LONG_NO_LIBRARY: + opt_library = LIBRARY_DISABLE; + INFO("using LD_LIBRARY_PATH to find library\n"); + break; + + case LONG_LIBRARY: + opt_library = optarg; + break; + + case LONG_SHARE: + opt_share = 1; + break; + + case -1: + break; + + default: + WARNING("unparsed option %08x\n", ret); + ret = -1; + break; + } + } + index = optind; + + if (!opt_dry_run && (argc - index) < 1) { + print_usage(); + exit(EXIT_FAILURE); + } + + verbose_expose(); + + if (opt_library != LIBRARY_DISABLE) + library_path(opt_library); + + if (opt_mappings) + setup_mappings(opt_mappings); + + if (opt_preload) + ldpreload(opt_mappings); + + if (opt_no_reserve) + setup_environment("HUGETLB_NO_RESERVE", "yes"); + 
+ if (opt_share) + setup_environment("HUGETLB_SHARE", "1"); + + if (opt_thp_heap) + setup_environment("HUGETLB_MORECORE", "thp"); + + if (opt_dry_run) + exit(EXIT_SUCCESS); + + execvp(argv[index], &argv[index]); + ERROR("exec failed: %s\n", strerror(errno)); + exit(EXIT_FAILURE); +} diff --git a/default/libhugetlbfs/libhugetlbfs/hugeedit.c b/default/libhugetlbfs/libhugetlbfs/hugeedit.c new file mode 100644 index 0000000..2785200 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/hugeedit.c @@ -0,0 +1,240 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <elf.h> +#include <link.h> +#include <getopt.h> +#include <errno.h> +#include <sys/stat.h> +#include <sys/mman.h> + +/* + * Eventually we plan to use the libhugetlbfs reporting facility, + * but until that is possible, redefine a simpler version here. + */ +#define REPORT(level, prefix, format, ...) 
\ + do { \ + fprintf(stderr, "hugeedit: " prefix ": " format, \ + ##__VA_ARGS__); \ + } while (0) + +#include "libhugetlbfs_internal.h" + +/* + * All MAP_* options are tagged with MAP_BASE to differentiate them as options + * in the options parser. This must be removed before they are compared. + */ +#define MAP_BASE 0x1000 +#define MAP_DISABLE 0x0001 +#define MAP_TEXT 0x0002 +#define MAP_DATA 0x0004 + +#define PF_LINUX_HUGETLB 0x100000 +extern int optind; +extern char *optarg; + +#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text) +#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text) + +void print_usage() +{ + fprintf(stderr, "hugeedit [options] target\n"); + fprintf(stderr, "options:\n"); + OPTION("--text", "Remap program text into huge pages by default"); + OPTION("--data", "Remap program data into huge pages by default"); + OPTION("--disable", "Remap no segments into huge pages by default"); + OPTION("--help, -h", "Print this usage information"); +} + +int check_elf_wordsize(void *ehdr) +{ + char *e_ident = (char *) ehdr; + + if (strncmp(e_ident, ELFMAG, SELFMAG)) { + ERROR("Not a valid ELF executable\n"); + exit(EXIT_FAILURE); + } + + switch (e_ident[EI_CLASS]) { + case ELFCLASS32: + case ELFCLASS64: + return e_ident[EI_CLASS]; + default: + ERROR("Can not determine word size\n"); + exit(EXIT_FAILURE); + } +} + +/* + * We need to map enough of the binary so that we can access the ELF header and + * all of the program headers. This function takes a pointer to the first page + * of ELF headers which is guaranteed to be enough data to determine if we need + * to map more of the binary. Use mremap to enlarge the mapping if needed. 
+ * + * void **elf - may be updated with a new address if mremap moved it + * unsigned long *size - may be updated with the new mapping size + */ +#define elf_ph_end_offset(e) ((e)->e_phoff + (e)->e_phentsize * (e)->e_phnum) +void check_remap_elf(void **elf, unsigned long *size, int wordsize) +{ + unsigned long newsize; + int pagesize = getpagesize(); + + if (wordsize == ELFCLASS32) { + Elf32_Ehdr *ehdr = *(Elf32_Ehdr **) elf; + newsize = elf_ph_end_offset(ehdr); + } else { + Elf64_Ehdr *ehdr = *(Elf64_Ehdr **) elf; + newsize = elf_ph_end_offset(ehdr); + } + newsize = ALIGN_UP(newsize, pagesize); + + if (newsize > *size) { + *size = newsize; + *elf = mremap(*elf, *size, newsize, MREMAP_MAYMOVE); + if (*elf == MAP_FAILED) { + ERROR("Remapping failed: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + } +} + +#define is_text(p) ((((p)->p_flags & (PF_R|PF_W|PF_X)) == (PF_R|PF_X)) && \ + ((p)->p_memsz == (p)->p_filesz)) +#define is_data(p) (((p)->p_flags & (PF_R|PF_W|PF_X)) == (PF_R|PF_W)) + +#define update_phdrs(_BITS_) \ +void update_phdrs##_BITS_(Elf##_BITS_##_Ehdr *ehdr, int remap_opts) \ +{ \ + int i; \ + Elf##_BITS_##_Phdr *phdr; \ + unsigned long long start, end; \ + \ + phdr = (Elf##_BITS_##_Phdr *)((char *)ehdr + ehdr->e_phoff); \ + for (i = 0; i < ehdr->e_phnum; i++) { \ + if (phdr[i].p_type != PT_LOAD) \ + continue; \ + if (remap_opts) \ + phdr[i].p_flags &= ~PF_LINUX_HUGETLB; \ + if ((remap_opts & MAP_TEXT) && is_text(&phdr[i])) \ + phdr[i].p_flags |= PF_LINUX_HUGETLB; \ + if ((remap_opts & MAP_DATA) && is_data(&phdr[i])) \ + phdr[i].p_flags |= PF_LINUX_HUGETLB; \ + start = (unsigned long long) phdr[i].p_vaddr; \ + end = start + phdr[i].p_memsz; \ + printf("Segment %i 0x%llx - 0x%llx (%s%s) default is " \ + "%s pages\n", i, start, end, \ + is_text(&phdr[i]) ? "TEXT" : "", \ + is_data(&phdr[i]) ? "DATA" : "", \ + (phdr[i].p_flags & PF_LINUX_HUGETLB) ? 
\ + "HUGE" : "BASE"); \ + } \ +} +update_phdrs(32) +update_phdrs(64) + +int main(int argc, char ** argv) +{ + char opts[] = "+h"; + struct option long_opts[] = { + {"help", no_argument, NULL, 'h'}, + {"disable", no_argument, NULL, MAP_BASE|MAP_DISABLE}, + {"text", no_argument, NULL, MAP_BASE|MAP_TEXT}, + {"data", no_argument, NULL, MAP_BASE|MAP_DATA}, + {0}, + }; + int ret = 0, index = 0, remap_opts = 0; + int fd; + const char *target; + void *ehdr; + unsigned long mapsize = getpagesize(); + int target_wordsize; + + while (ret != -1) { + ret = getopt_long(argc, argv, opts, long_opts, &index); + if (ret > 0 && (ret & MAP_BASE)) { + remap_opts |= ret; + continue; + } + switch (ret) { + case '?': + print_usage(); + exit(EXIT_FAILURE); + case 'h': + print_usage(); + exit(EXIT_SUCCESS); + + default: + ret = -1; + break; + } + } + index = optind; + remap_opts &= ~MAP_BASE; + if (remap_opts & MAP_DISABLE && remap_opts != MAP_DISABLE) { + ERROR("--disable is not compatible with --text or --data\n"); + exit(EXIT_FAILURE); + } + + if ((argc - index) != 1) { + print_usage(); + exit(EXIT_FAILURE); + } + target = argv[index]; + + /* We don't need write access unless we plan to alter the binary */ + fd = open(target, (remap_opts ? O_RDWR : O_RDONLY)); + if (fd < 0) { + ERROR("Opening %s failed: %s\n", target, strerror(errno)); + exit(EXIT_FAILURE); + } + + ehdr = mmap(NULL, mapsize, PROT_READ | (remap_opts ? 
PROT_WRITE : 0), + MAP_SHARED, fd, 0); + if (ehdr == MAP_FAILED) { + ERROR("Mapping %s failed: %s\n", target, strerror(errno)); + exit(EXIT_FAILURE); + } + + target_wordsize = check_elf_wordsize(ehdr); + check_remap_elf(&ehdr, &mapsize, target_wordsize); + if (target_wordsize == ELFCLASS64) + update_phdrs64((Elf64_Ehdr *) ehdr, remap_opts); + else + update_phdrs32((Elf32_Ehdr *) ehdr, remap_opts); + + if (munmap(ehdr, mapsize) != 0) { + ERROR("Unmapping %s failed: %s\n", target, strerror(errno)); + exit(EXIT_FAILURE); + } + if (close(fd) != 0) { + ERROR("Final close of %s failed: %s -- possible data loss!\n", + target, strerror(errno)); + exit(EXIT_FAILURE); + } + + exit(EXIT_SUCCESS); +} diff --git a/default/libhugetlbfs/libhugetlbfs/hugetlbfs.h b/default/libhugetlbfs/libhugetlbfs/hugetlbfs.h new file mode 100644 index 0000000..ecd178b --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/hugetlbfs.h @@ -0,0 +1,79 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This file should only contain definitions of functions, data types, and + * constants which are part of the published libhugetlfs API. 
Functions + * exported here must also be listed in version.lds. + */ + +#ifndef _HUGETLBFS_H +#define _HUGETLBFS_H + +#define HUGETLBFS_MAGIC 0x958458f6 + +long gethugepagesize(void); +int gethugepagesizes(long pagesizes[], int n_elem); +int getpagesizes(long pagesizes[], int n_elem); +int hugetlbfs_test_path(const char *mount); +const char *hugetlbfs_find_path(void); +const char *hugetlbfs_find_path_for_size(long page_size); +int hugetlbfs_unlinked_fd(void); +int hugetlbfs_unlinked_fd_for_size(long page_size); + +#define PF_LINUX_HUGETLB 0x100000 + +/* + * Direct hugepage allocation flags and types + * + * GHP_DEFAULT - Use the default hugepage size to back the region + */ +typedef unsigned long ghp_t; +#define GHP_DEFAULT ((ghp_t)0x01UL) +#define GHP_MASK (GHP_DEFAULT) + +/* Direct alloc functions for hugepages */ +void *get_huge_pages(size_t len, ghp_t flags); +void free_huge_pages(void *ptr); + +/* + * Region alloc flags and types + * + * GHR_DEFAULT - Use a combination of flags deemed to be a sensible default + * by the current implementation of the library + * GHR_FALLBACK - Use the default hugepage size if possible but fallback to + * smaller pages if necessary + * GHR_STRICT - Use hugepages of some size or return NULL + * GHP_COLOR - Use bytes wasted due to alignment to offset the buffer + * by a random cache line. 
This gives better average + * performance with many buffers + */ +typedef unsigned long ghr_t; +#define GHR_STRICT ((ghr_t)0x10000000U) +#define GHR_FALLBACK ((ghr_t)0x20000000U) +#define GHR_COLOR ((ghr_t)0x40000000U) +#define GHR_DEFAULT (GHR_FALLBACK|GHR_COLOR) + +#define GHR_MASK (GHR_FALLBACK|GHR_STRICT|GHR_COLOR) + +/* Allocation functions for regions backed by hugepages */ +void *get_hugepage_region(size_t len, ghr_t flags); +void free_hugepage_region(void *ptr); + +#endif /* _HUGETLBFS_H */ diff --git a/default/libhugetlbfs/libhugetlbfs/hugeutils.c b/default/libhugetlbfs/libhugetlbfs/hugeutils.c new file mode 100644 index 0000000..f8e2b33 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/hugeutils.c @@ -0,0 +1,1184 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LARGEFILE64_SOURCE /* Need this for statfs64 */ +#define _GNU_SOURCE +#include <dlfcn.h> +#include <features.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <errno.h> +#include <limits.h> +#include <string.h> +#include <ctype.h> +#include <signal.h> +#include <dirent.h> + +#include <unistd.h> +#include <fcntl.h> +#include <sys/vfs.h> +#include <sys/statfs.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/file.h> +#include <sys/uio.h> +#include <sys/syscall.h> +#include <linux/types.h> +#include <linux/unistd.h> +#include <dirent.h> + +#include "libhugetlbfs_internal.h" +#include "hugetlbfs.h" + +struct libhugeopts_t __hugetlb_opts; + +static int hugepagesize_errno; /* = 0 */ + +#define MAX_HPAGE_SIZES 10 +static struct hpage_size hpage_sizes[MAX_HPAGE_SIZES]; +static int nr_hpage_sizes; +static int hpage_sizes_default_idx = -1; + +static long default_size; + +/********************************************************************/ +/* Internal functions */ +/********************************************************************/ + +/* + * Lookup the kernel default page size. + */ +long kernel_default_hugepage_size() +{ + if (default_size == 0) { + default_size = file_read_ulong(MEMINFO, "Hugepagesize:"); + default_size = size_to_smaller_unit(default_size); /* kB to B */ } + return default_size; +} +void kernel_default_hugepage_size_reset(void) +{ + default_size = 0; +} + +#define BUF_SZ 256 +#define MEMINFO_SIZE 2048 + +/* + * Convert a quantity in a given unit to the next smallest unit by + * multiplying the quantity by 1024 (eg. convert 1MB to 1024kB). + * If the conversion would overflow the variable, return ULONGLONG_MAX to + * signify the error. 
+ */ +unsigned long long size_to_smaller_unit(unsigned long long size) +{ + if (size * 1024 < size) + return -1; + else + return size * 1024; +} + +/* + * Convert a page size string with an optional unit suffix into a page size + * in bytes. + * + * On error, -1 is returned and errno is set appropriately: + * EINVAL - str could not be parsed or was not greater than zero + * EOVERFLOW - Overflow when converting from the specified units + */ +long parse_page_size(const char *str) +{ + char *pos; + long size; + + errno = 0; + size = strtol(str, &pos, 0); + /* Catch strtol errors and sizes that overflow the native word size */ + if (errno || str == pos || size <= 0) { + if (errno == ERANGE) + errno = EOVERFLOW; + else + errno = EINVAL; + return -1; + } + + switch (*pos) { + case 'G': + case 'g': + size = size_to_smaller_unit(size); + case 'M': + case 'm': + size = size_to_smaller_unit(size); + case 'K': + case 'k': + size = size_to_smaller_unit(size); + } + + if (size < 0) + errno = EOVERFLOW; + return size; +} + +struct hugetlb_pool_counter_info_t { + char *meminfo_key; + char *sysfs_file; +}; + +static struct hugetlb_pool_counter_info_t hugetlb_counter_info[] = { + [HUGEPAGES_TOTAL] = { + .meminfo_key = "HugePages_Total:", + .sysfs_file = "nr_hugepages", + }, + [HUGEPAGES_TOTAL_MEMPOL] = { + .meminfo_key = "HugePages_Total:", + .sysfs_file = "nr_hugepages_mempolicy", + }, + [HUGEPAGES_FREE] = { + .meminfo_key = "HugePages_Free:", + .sysfs_file = "free_hugepages", + }, + [HUGEPAGES_RSVD] = { + .meminfo_key = "HugePages_Rsvd:", + .sysfs_file = "resv_hugepages", + }, + [HUGEPAGES_SURP] = { + .meminfo_key = "HugePages_Surp:", + .sysfs_file = "surplus_hugepages", + }, + [HUGEPAGES_OC] = { + .meminfo_key = NULL, + .sysfs_file = "nr_overcommit_hugepages" + }, +}; + +/* + * Read numeric data from raw and tagged kernel status files. Used to read + * /proc and /sys data (without a tag) and from /proc/meminfo (with a tag). 
+ */ +long file_read_ulong(char *file, const char *tag) +{ + int fd; + char buf[MEMINFO_SIZE]; + int len, readerr; + char *p, *q; + long val; + + fd = open(file, O_RDONLY); + if (fd < 0) { + ERROR("Couldn't open %s: %s\n", file, strerror(errno)); + return -1; + } + + len = read(fd, buf, sizeof(buf)); + readerr = errno; + close(fd); + if (len < 0) { + ERROR("Error reading %s: %s\n", file, strerror(readerr)); + return -1; + } + if (len == sizeof(buf)) { + ERROR("%s is too large\n", file); + return -1; + } + buf[len] = '\0'; + + /* Search for a tag if provided */ + if (tag) { + p = strstr(buf, tag); + if (!p) + return -1; /* looks like the line we want isn't there */ + p += strlen(tag); + } else + p = buf; + + val = strtol(p, &q, 0); + if (! isspace(*q)) { + ERROR("Couldn't parse %s value\n", file); + return -1; + } + + return val; +} + +int file_write_ulong(char *file, unsigned long val) +{ + FILE *f; + int ret; + + f = fopen(file, "w"); + if (!f) { + ERROR("Couldn't open %s: %s\n", file, strerror(errno)); + return -1; + } + + ret = fprintf(f, "%lu", val); + fclose(f); + return ret > 0 ? 0 : -1; +} + + +/* + * Return the name of this executable, using buf as temporary space. + */ +#define MAX_EXE 4096 +static char *get_exe_name(char *buf, int size) +{ + char *p; + int fd; + ssize_t nread; + + buf[0] = 0; + fd = open("/proc/self/cmdline", O_RDONLY); + if (fd < 0) { + WARNING("Unable to open cmdline, no exe name\n"); + return buf; + } + nread = read(fd, buf, size-1); + close(fd); + + if (nread < 0) { + WARNING("Error %d reading cmdline, no exe name\n", errno); + return buf; + } + if (nread == 0) { + WARNING("Read zero bytes from cmdline, no exe name\n"); + return buf; + } + + buf[nread] = 0; /* make sure we're null terminated */ + /* + * Take advantage of cmdline being a series of null-terminated + * strings. The first string is the path to the executable in + * the form: + * + * /path/to/exe + * + * The exe name starts one character after the last '/'. 
+ */ + p = strrchr(buf, '/'); + if (!p) + return buf; + return p + 1; /* skip over "/" */ +} + + +/* + * Reads the contents of hugetlb environment variables and save their + * values for later use. + */ +void hugetlbfs_setup_env() +{ + char *env; + + __hugetlb_opts.min_copy = true; + + env = getenv("HUGETLB_VERBOSE"); + if (env) + __hugetlbfs_verbose = atoi(env); + + env = getenv("HUGETLB_DEBUG"); + if (env) { + __hugetlbfs_debug = true; + __hugetlbfs_verbose = VERBOSE_DEBUG; + } + + env = getenv("HUGETLB_RESTRICT_EXE"); + if (env) { + char *p, *tok, *exe, buf[MAX_EXE+1], restrict[MAX_EXE]; + int found = 0; + + exe = get_exe_name(buf, sizeof buf); + DEBUG("Found HUGETLB_RESTRICT_EXE, this exe is \"%s\"\n", exe); + strncpy(restrict, env, sizeof restrict); + restrict[sizeof(restrict)-1] = 0; + for (p = restrict; (tok = strtok(p, ":")) != NULL; p = NULL) { + DEBUG(" ...check exe match for \"%s\"\n", tok); + if (strcmp(tok, exe) == 0) { + found = 1; + DEBUG("exe match - libhugetlbfs is active for this exe\n"); + break; + } + } + if (!found) { + DEBUG("No exe match - libhugetlbfs is inactive for this exe\n"); + return; + } + } + + env = getenv("HUGETLB_NO_PREFAULT"); + if (env) + __hugetlbfs_prefault = false; + + __hugetlb_opts.share_path = getenv("HUGETLB_SHARE_PATH"); + __hugetlb_opts.elfmap = getenv("HUGETLB_ELFMAP"); + __hugetlb_opts.ld_preload = getenv("LD_PRELOAD"); + __hugetlb_opts.def_page_size = getenv("HUGETLB_DEFAULT_PAGE_SIZE"); + __hugetlb_opts.path = getenv("HUGETLB_PATH"); + __hugetlb_opts.features = getenv("HUGETLB_FEATURES"); + __hugetlb_opts.morecore = getenv("HUGETLB_MORECORE"); + __hugetlb_opts.heapbase = getenv("HUGETLB_MORECORE_HEAPBASE"); + + if (__hugetlb_opts.morecore) + __hugetlb_opts.thp_morecore = + (strcasecmp(__hugetlb_opts.morecore, "thp") == 0); + + if (__hugetlb_opts.thp_morecore && __hugetlb_opts.heapbase) { + DEBUG("Heapbase specified with THP for morecore, ignoring heapbase\n"); + __hugetlb_opts.heapbase = NULL; + } + + env = 
getenv("HUGETLB_FORCE_ELFMAP"); + if (env && (strcasecmp(env, "yes") == 0)) + __hugetlb_opts.force_elfmap = 1; + + env = getenv("HUGETLB_MINIMAL_COPY"); + if (__hugetlb_opts.min_copy && env && (strcasecmp(env, "no") == 0)) { + INFO("HUGETLB_MINIMAL_COPY=%s, disabling filesz copy " + "optimization\n", env); + __hugetlb_opts.min_copy = false; + } + + env = getenv("HUGETLB_SHARE"); + if (env) + __hugetlb_opts.sharing = atoi(env); + + /* + * We have been seeing some unexpected behavior from malloc when + * heap shrinking is enabled, so heap shrinking is disabled by + * default. + * + * If malloc has been called successfully before setup_morecore, + * glibc will notice a gap between the previous top-of-heap and + * the new top-of-heap when it calls hugetlbfs_morecore. It treats + * this as a "foreign sbrk." Unfortunately, the "foreign sbrk" + * handling code will then immediately try to free the memory + * allocated by hugetlbfs_morecore! + * + * This behavior has been reported to the ptmalloc2 maintainer, + * along with a patch to correct the behavior. 
+ */ + env = getenv("HUGETLB_MORECORE_SHRINK"); + if (env && strcasecmp(env, "yes") == 0) + __hugetlb_opts.shrink_ok = true; + + /* Determine if shmget() calls should be overridden */ + env = getenv("HUGETLB_SHM"); + if (env && !strcasecmp(env, "yes")) + __hugetlb_opts.shm_enabled = true; + + /* Determine if all reservations should be avoided */ + env = getenv("HUGETLB_NO_RESERVE"); + if (env && !strcasecmp(env, "yes")) + __hugetlb_opts.no_reserve = true; +} + +void hugetlbfs_setup_kernel_page_size() +{ + long page_size = kernel_default_hugepage_size(); + + if (page_size <= 0) { + WARNING("Unable to find default kernel huge page size\n"); + return; + } + + INFO("Found pagesize %ld kB\n", page_size / 1024); + hpage_sizes[0].pagesize = page_size; + + nr_hpage_sizes = 1; +} + +void hugetlbfs_check_priv_resv() +{ + /* + * If the kernel supports MAP_PRIVATE reservations, we can skip + * prefaulting the huge pages we allocate since the kernel + * guarantees them. This can help NUMA performance quite a bit. + */ + if (hugetlbfs_test_feature(HUGETLB_FEATURE_PRIVATE_RESV) > 0) { + INFO("Kernel has MAP_PRIVATE reservations. Disabling " + "heap prefaulting.\n"); + __hugetlbfs_prefault = false; + } +} + +void hugetlbfs_check_safe_noreserve() +{ + /* + * Some kernels will trigger an OOM if MAP_NORESERVE is used and + * a huge page allocation fails. This is unfortunate so limit + * the use of NORESERVE where necessary + */ + if (__hugetlb_opts.no_reserve && + hugetlbfs_test_feature(HUGETLB_FEATURE_SAFE_NORESERVE) <= 0) { + INFO("Kernel is not safe for MAP_NORESERVE. Forcing " + "use of reservations.\n"); + __hugetlb_opts.no_reserve = false; + } +} + +void hugetlbfs_check_map_hugetlb() +{ +/* + * FIXME: MAP_HUGETLB has not been picked up by glibc so even though the + * kernel may support it, without the userspace mmap flag it cannot be + * used. This ifdef should be removed when the MAP_HUGETLB flag makes it + * into glibc. 
+ */ +#ifdef MAP_HUGETLB + /* + * Kernels after 2.6.32 support mmaping pseudo-anonymous regions + * backed by huge pages, use this feature for huge pages we + * don't intend to share. + */ + if (hugetlbfs_test_feature(HUGETLB_FEATURE_MAP_HUGETLB) > 0) { + INFO("Kernel supports MAP_HUGETLB\n"); + __hugetlb_opts.map_hugetlb = true; + } +#endif +} + +/* + * Pool counters are typically exposed in sysfs in modern kernels, the + * counters for the default page size are exposed in procfs in all kernels + * supporting hugepages. Given a specific counter (e.g. HUGEPAGES_RSVD) + * and a page size return both a filename and an optional tag to locate + * and extract this counter. + */ +static int select_pool_counter(unsigned int counter, unsigned long pagesize, + char *filename, char **key) +{ + long default_size; + char *meminfo_key; + char *sysfs_file; + + if (counter >= HUGEPAGES_MAX_COUNTERS) { + ERROR("Invalid counter specified\n"); + return -1; + } + + meminfo_key = hugetlb_counter_info[counter].meminfo_key; + sysfs_file = hugetlb_counter_info[counter].sysfs_file; + if (key) + *key = NULL; + + /* + * Get the meminfo page size. + * This could be made more efficient if utility functions were shared + * between libhugetlbfs and the test suite. For now we will just + * read /proc/meminfo. 
+ */ + default_size = kernel_default_hugepage_size(); + if (default_size < 0) { + ERROR("Cannot determine the default page size\n"); + return -1; + } + + /* If the user is dealing in the default page size, we can use /proc */ + if (pagesize == default_size) { + if (meminfo_key && key) { + strcpy(filename, MEMINFO); + *key = meminfo_key; + } else + sprintf(filename, PROC_HUGEPAGES_DIR "%s", sysfs_file); + } else /* Use the sysfs interface */ + sprintf(filename, SYSFS_HUGEPAGES_DIR "hugepages-%lukB/%s", + pagesize / 1024, sysfs_file); + return 0; +} + +static int hpage_size_to_index(unsigned long size) +{ + int i; + + for (i = 0; i < nr_hpage_sizes; i++) + if (hpage_sizes[i].pagesize == size) + return i; + return -1; +} + +void probe_default_hpage_size(void) +{ + long size; + int index; + int default_overrided; + + if (nr_hpage_sizes == 0) { + INFO("No configured huge page sizes\n"); + hpage_sizes_default_idx = -1; + return; + } + + /* + * Check if the user specified a default size, otherwise use the + * system default size as reported by /proc/meminfo. + */ + default_overrided = (__hugetlb_opts.def_page_size && + strlen(__hugetlb_opts.def_page_size) > 0); + if (default_overrided) + size = parse_page_size(__hugetlb_opts.def_page_size); + else { + size = kernel_default_hugepage_size(); + } + + if (size >= 0) { + index = hpage_size_to_index(size); + if (index >= 0) + hpage_sizes_default_idx = index; + else { + /* + * If the user specified HUGETLB_DEFAULT_PAGE_SIZE, + * then this situation will alter semantics and they + * should receive a WARNING. Otherwise, this detail + * is purely informational in nature. + */ + char msg[] = "No mount point found for default huge " \ + "page size. 
Using first available mount " + "point.\n"; + if (default_overrided) + WARNING("%s", msg); + else + INFO("%s", msg); + hpage_sizes_default_idx = 0; + } + } else { + ERROR("Unable to determine default huge page size\n"); + hpage_sizes_default_idx = -1; + } +} + +static void add_hugetlbfs_mount(char *path, int user_mount) +{ + int idx; + long size; + + if (strlen(path) > PATH_MAX) + return; + + if (!hugetlbfs_test_path(path)) { + WARNING("%s is not a hugetlbfs mount point, ignoring\n", path); + return; + } + + size = hugetlbfs_test_pagesize(path); + if (size < 0) { + WARNING("Unable to detect page size for path %s\n", path); + return; + } + + idx = hpage_size_to_index(size); + if (idx < 0) { + if (nr_hpage_sizes >= MAX_HPAGE_SIZES) { + WARNING("Maximum number of huge page sizes exceeded, " + "ignoring %lukB page size\n", size); + return; + } + + idx = nr_hpage_sizes; + hpage_sizes[nr_hpage_sizes++].pagesize = size; + } + + if (strlen(hpage_sizes[idx].mount)) { + if (user_mount) + WARNING("Mount point already defined for size %li, " + "ignoring %s\n", size, path); + return; + } + + strcpy(hpage_sizes[idx].mount, path); +} + +void debug_show_page_sizes(void) +{ + int i; + + INFO("Detected page sizes:\n"); + for (i = 0; i < nr_hpage_sizes; i++) + INFO(" Size: %li kB %s Mount: %s\n", + hpage_sizes[i].pagesize / 1024, + i == hpage_sizes_default_idx ? 
"(default)" : "", + hpage_sizes[i].mount); +} + +#define LINE_MAXLEN 2048 +static void find_mounts(void) +{ + int fd; + char path[PATH_MAX+1]; + char line[LINE_MAXLEN + 1]; + char *eol; + int bytes, err, dummy; + off_t offset; + + fd = open("/proc/mounts", O_RDONLY); + if (fd < 0) { + fd = open("/etc/mtab", O_RDONLY); + if (fd < 0) { + ERROR("Couldn't open /proc/mounts or /etc/mtab (%s)\n", + strerror(errno)); + return; + } + } + + while ((bytes = read(fd, line, LINE_MAXLEN)) > 0) { + line[LINE_MAXLEN] = '\0'; + eol = strchr(line, '\n'); + if (!eol) { + ERROR("Line too long when parsing mounts\n"); + break; + } + + /* + * Truncate the string to just one line and reset the file + * to begin reading at the start of the next line. + */ + *eol = '\0'; + offset = bytes - (eol + 1 - line); + lseek(fd, -offset, SEEK_CUR); + + /* + * Match only hugetlbfs filesystems. + * Subtle: sscanf returns the number of input items matched + * and assigned. To force sscanf to match the literal + * "hugetlbfs" string we include a 'dummy' input item + * following that string. + */ + err = sscanf(line, "%*s %" stringify(PATH_MAX) "s hugetlbfs " + "%*s %d", path, &dummy); + if ((err == 2) && (hugetlbfs_test_path(path) == 1) && + !(access(path, R_OK | W_OK | X_OK))) + add_hugetlbfs_mount(path, 0); + } + close(fd); +} + +void setup_mounts(void) +{ + int do_scan = 1; + + /* If HUGETLB_PATH is set, only add mounts specified there */ + while (__hugetlb_opts.path) { + char path[PATH_MAX + 1]; + char *next = strchrnul(__hugetlb_opts.path, ':'); + + do_scan = 0; + if (next - __hugetlb_opts.path > PATH_MAX) { + ERROR("Path too long in HUGETLB_PATH -- " + "ignoring environment\n"); + break; + } + + strncpy(path, __hugetlb_opts.path, next - __hugetlb_opts.path); + path[next - __hugetlb_opts.path] = '\0'; + add_hugetlbfs_mount(path, 1); + + /* skip the ':' token */ + __hugetlb_opts.path = *next == '\0' ? 
NULL : next + 1; + } + + /* Then probe all mounted filesystems */ + if (do_scan) + find_mounts(); +} + +int get_pool_size(long size, struct hpage_pool *pool) +{ + long nr_over = 0; + long nr_used = 0; + long nr_surp = 0; + long nr_resv = 0; + long nr_static = 0; + + long it_used = -1; + long it_surp = -1; + long it_resv = -1; + + /* + * Pick up those values which are basically stable with respect to + * the admin; ie. only changed by them. + * + * nr_over may be negative if this kernel does not support overcommit + * in that case we will consider it always 0 and max will track min + * always. + */ + nr_over = get_huge_page_counter(size, HUGEPAGES_OC); + if (nr_over < 0) + nr_over = 0; + + /* Sample the volatile values until they are stable. */ + while (nr_used != it_used || nr_surp != it_surp || nr_resv != it_resv) { + nr_used = it_used; + nr_surp = it_surp; + nr_resv = it_resv; + + it_used = get_huge_page_counter(size, HUGEPAGES_TOTAL); + it_surp = get_huge_page_counter(size, HUGEPAGES_SURP); + it_resv = get_huge_page_counter(size, HUGEPAGES_RSVD); + } + if (nr_surp < 0) + nr_surp = 0; + if (nr_resv < 0) + nr_resv = 0; + + nr_static = nr_used - nr_surp; + + if (nr_static >= 0) { + DEBUG("pagesize<%ld> min<%ld> max<%ld> " + "in-use<%ld>\n", + size, nr_static, nr_static + nr_over, + nr_used); + pool->pagesize = size; + pool->minimum = nr_static; + pool->maximum = nr_static + nr_over; + pool->size = nr_used; + pool->is_default = 0; + + return 1; + } + + return 0; +} + +int hpool_sizes(struct hpage_pool *pools, int pcnt) +{ + long default_size; + int which = 0; + DIR *dir; + struct dirent *entry; + + default_size = kernel_default_hugepage_size(); + if (default_size >= 0 && which < pcnt) + if (get_pool_size(default_size, &pools[which])) { + pools[which].is_default = 1; + which++; + } + + dir = opendir(SYSFS_HUGEPAGES_DIR); + if (dir) { + while ((entry = readdir(dir))) { + char *name = entry->d_name; + long size; + + DEBUG("parsing<%s>\n", name); + if (strncmp(name, 
"hugepages-", 10) != 0) + continue; + name += 10; + + size = size_to_smaller_unit(atol(name)); + if (size < 0 || size == default_size) + continue; + + if (get_pool_size(size, &pools[which])) + which++; + } + closedir(dir); + } + + return (which < pcnt) ? which : -1; +} + +/* + * If we have a default page size then we support hugepages. + */ +int kernel_has_hugepages(void) +{ + long default_size = kernel_default_hugepage_size(); + if (default_size < 0) + return 0; + + return 1; +} + +/* + * If we can find the default page size, and if we can find an overcommit + * control for it then the kernel must support overcommit. + */ +int kernel_has_overcommit(void) +{ + long default_size = kernel_default_hugepage_size(); + if (default_size < 0) + return 0; + + if (get_huge_page_counter(default_size, HUGEPAGES_OC) < 0) + return 0; + + return 1; +} + +/********************************************************************/ +/* Library user visible functions */ +/********************************************************************/ + +/* + * NOTE: This function uses data that is initialized by + * setup_mounts() which is called during libhugetlbfs initialization. + * + * returns: + * on success, size of a huge page in number of bytes + * on failure, -1 + * errno set to ENOSYS if huge pages are not supported + * errno set to EOVERFLOW if huge page size would overflow return type + */ +long gethugepagesize(void) +{ + long hpage_size; + + /* Are huge pages available and have they been initialized? 
*/ + if (hpage_sizes_default_idx == -1) { + errno = hugepagesize_errno = ENOSYS; + return -1; + } + + errno = 0; + hpage_size = hpage_sizes[hpage_sizes_default_idx].pagesize; + return hpage_size; +} + +int gethugepagesizes(long pagesizes[], int n_elem) +{ + long default_size; + DIR *sysfs; + struct dirent *ent; + int nr_sizes = 0; + + if (n_elem < 0) { + errno = EINVAL; + return -1; + } + + if (n_elem > 0 && pagesizes == NULL) { + errno = EINVAL; + return -1; + } + + errno = 0; + + /* Get the system default size. */ + default_size = kernel_default_hugepage_size(); + if (default_size < 0) + return 0; + + if (pagesizes && (nr_sizes == n_elem)) + return nr_sizes; + if (pagesizes) + pagesizes[nr_sizes] = default_size; + nr_sizes++; + + /* + * Scan sysfs to look for other sizes. + * Non-existing dir is not an error, we got one size from /proc/meminfo. + */ + sysfs = opendir(SYSFS_HUGEPAGES_DIR); + if (!sysfs) { + if (errno == ENOENT) { + errno = 0; + return nr_sizes; + } else + return -1; + } + while ((ent = readdir(sysfs))) { + long size; + + if (strncmp(ent->d_name, "hugepages-", 10)) + continue; + + size = strtol(ent->d_name + 10, NULL, 10); + if (size == LONG_MIN || size == LONG_MAX) + continue; + size = size_to_smaller_unit(size); + + if (size < 0 || size == default_size) + continue; + if (pagesizes && (nr_sizes == n_elem)) + return nr_sizes; + if (pagesizes) + pagesizes[nr_sizes] = size; + nr_sizes++; + } + closedir(sysfs); + + return nr_sizes; +} + +int getpagesizes(long pagesizes[], int n_elem) +{ + int ret; + + if (n_elem < 0 || (n_elem > 0 && pagesizes == NULL)) { + errno = EINVAL; + return -1; + } + + /* Requests for sizing, we need one more slot than gethugepagesizes. */ + if (pagesizes == NULL && n_elem == 0) { + ret = gethugepagesizes(pagesizes, n_elem); + } else { + /* Install the base page size. 
*/ + if (pagesizes && n_elem == 0) + return 0; + if (pagesizes) + pagesizes[0] = sysconf(_SC_PAGESIZE); + + ret = gethugepagesizes(pagesizes + 1, n_elem - 1); + } + if (ret < 0) + return ret; + return ret + 1; +} + +int hugetlbfs_test_path(const char *mount) +{ + struct statfs64 sb; + int err; + + /* Bugs in the 32<->64 translation code in pre-2.6.15 kernels + * mean that plain statfs() returns bogus errors on hugetlbfs + * filesystems. Use statfs64() to work around. */ + err = statfs64(mount, &sb); + if (err) + return -1; + + return (sb.f_type == HUGETLBFS_MAGIC); +} + +/* Return the page size for the given mount point in bytes */ +long hugetlbfs_test_pagesize(const char *mount) +{ + struct statfs64 sb; + int err; + + err = statfs64(mount, &sb); + if (err) + return -1; + + if ((sb.f_bsize <= 0) || (sb.f_bsize > LONG_MAX)) + return -1; + + return sb.f_bsize; +} + +const char *hugetlbfs_find_path_for_size(long page_size) +{ + char *path; + int idx; + + idx = hpage_size_to_index(page_size); + if (idx >= 0) { + path = hpage_sizes[idx].mount; + if (strlen(path)) + return path; + } + return NULL; +} + +const char *hugetlbfs_find_path(void) +{ + long hpage_size = gethugepagesize(); + if (hpage_size > 0) + return hugetlbfs_find_path_for_size(hpage_size); + else + return NULL; +} + +int hugetlbfs_unlinked_fd_for_size(long page_size) +{ + const char *path; + char name[PATH_MAX+1]; + int fd; + + path = hugetlbfs_find_path_for_size(page_size); + if (!path) + return -1; + + name[sizeof(name)-1] = '\0'; + + strcpy(name, path); + strncat(name, "/libhugetlbfs.tmp.XXXXXX", sizeof(name)-1); + /* FIXME: deal with overflows */ + + fd = mkstemp64(name); + + if (fd < 0) { + ERROR("mkstemp() failed: %s\n", strerror(errno)); + return -1; + } + + unlink(name); + + return fd; +} + +int hugetlbfs_unlinked_fd(void) +{ + long hpage_size = gethugepagesize(); + if (hpage_size > 0) + return hugetlbfs_unlinked_fd_for_size(hpage_size); + else + return -1; +} + +#define IOV_LEN 64 +int 
hugetlbfs_prefault(void *addr, size_t length) +{ + size_t offset; + struct iovec iov[IOV_LEN]; + int ret; + int i; + int fd; + + if (!__hugetlbfs_prefault) + return 0; + + /* + * The NUMA users of libhugetlbfs' malloc feature are + * expected to use the numactl program to specify an + * appropriate policy for hugepage allocation + * + * Use readv(2) to instantiate the hugepages unless HUGETLB_NO_PREFAULT + * is set. If we instead returned a hugepage mapping with insufficient + * hugepages, the VM system would kill the process when the + * process tried to access the missing memory. + * + * The value of this environment variable is read during library + * initialisation and sets __hugetlbfs_prefault accordingly. If + * prefaulting is enabled and we can't get all that were requested, + * -ENOMEM is returned. The caller is expected to release the entire + * mapping and optionally it may recover by mapping base pages instead. + */ + + fd = open("/dev/zero", O_RDONLY); + if (fd < 0) { + ERROR("Failed to open /dev/zero for reading\n"); + return -ENOMEM; + } + + for (offset = 0; offset < length; ) { + for (i = 0; i < IOV_LEN && offset < length; i++) { + iov[i].iov_base = addr + offset; + iov[i].iov_len = 1; + offset += gethugepagesize(); + } + ret = readv(fd, iov, i); + if (ret != i) { + DEBUG("Got %d of %d requested; err=%d\n", ret, + i, ret < 0 ? 
errno : 0); + WARNING("Failed to reserve %ld huge pages " + "for new region\n", + length / gethugepagesize()); + close(fd); + return -ENOMEM; + } + } + + close(fd); + return 0; +} + +long get_huge_page_counter(long pagesize, unsigned int counter) +{ + char file[PATH_MAX+1]; + char *key; + + if (select_pool_counter(counter, pagesize, file, &key)) + return -1; + + if (access(file, O_RDONLY)) + return -1; + + return file_read_ulong(file, key); +} + +int set_huge_page_counter(long pagesize, unsigned int counter, + unsigned long val) +{ + char file[PATH_MAX+1]; + + if (select_pool_counter(counter, pagesize, file, NULL)) + return -1; + + return file_write_ulong(file, val); +} + +int set_nr_hugepages(long pagesize, unsigned long val) +{ + return set_huge_page_counter(pagesize, HUGEPAGES_TOTAL, val); +} + +int set_nr_overcommit_hugepages(long pagesize, unsigned long val) +{ + DEBUG("setting HUGEPAGES_OC to %ld\n", val); + return set_huge_page_counter(pagesize, HUGEPAGES_OC, val); +} + +long read_nr_overcommit(long page_size) +{ + if (!kernel_has_overcommit()) + return -1; + + return get_huge_page_counter(page_size, HUGEPAGES_OC); +} + +void restore_overcommit_pages(long page_size, long oc_pool) +{ + if (!kernel_has_overcommit()) + return; + + set_nr_overcommit_hugepages(page_size, oc_pool); +} + +/********************************************************************/ +/* Library user visible DIAGNOSES/DEBUGGING ONLY functions */ +/********************************************************************/ + +#define MAPS_BUF_SZ 4096 +long dump_proc_pid_maps() +{ + FILE *f; + char line[MAPS_BUF_SZ]; + size_t ret; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + ERROR("Failed to open /proc/self/maps\n"); + return -1; + } + + while (1) { + ret = fread(line, sizeof(char), MAPS_BUF_SZ, f); + if (ret < 0) { + ERROR("Failed to read /proc/self/maps\n"); + return -1; + } + if (ret == 0) + break; + ret = fwrite(line, sizeof(char), ret, stderr); + if (ret < 0) { + ERROR("Failed to write 
/proc/self/maps to stderr\n"); + return -1; + } + } + + fclose(f); + return 0; +} + +long read_meminfo(const char *tag) +{ + return file_read_ulong(MEMINFO, tag); +} diff --git a/default/libhugetlbfs/libhugetlbfs/init.c b/default/libhugetlbfs/libhugetlbfs/init.c new file mode 100644 index 0000000..b912448 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/init.c @@ -0,0 +1,39 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Nishanth Aravamudan, IBM Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libhugetlbfs_internal.h" + +static void __attribute__ ((constructor)) setup_libhugetlbfs(void) +{ + hugetlbfs_setup_env(); + hugetlbfs_setup_debug(); + hugetlbfs_setup_kernel_page_size(); + setup_mounts(); + probe_default_hpage_size(); + if (__hugetlbfs_debug) + debug_show_page_sizes(); + setup_features(); + hugetlbfs_check_priv_resv(); + hugetlbfs_check_safe_noreserve(); + hugetlbfs_check_map_hugetlb(); +#ifndef NO_ELFLINK + hugetlbfs_setup_elflink(); +#endif + hugetlbfs_setup_morecore(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/init_privutils.c b/default/libhugetlbfs/libhugetlbfs/init_privutils.c new file mode 100644 index 0000000..f32d83b --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/init_privutils.c @@ -0,0 +1,27 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Nishanth Aravamudan, IBM Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libhugetlbfs_internal.h" + +static void __attribute__ ((constructor)) setup_libhugetlbfs(void) +{ + hugetlbfs_setup_debug(); + setup_mounts(); + setup_features(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/kernel-features.c b/default/libhugetlbfs/libhugetlbfs/kernel-features.c new file mode 100644 index 0000000..b8cdec9 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/kernel-features.c @@ -0,0 +1,271 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE /* For strchrnul */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <string.h> +#include <sys/utsname.h> +#include "kernel-features.h" +#include "hugetlbfs.h" +#include "libhugetlbfs_privutils.h" +#include "libhugetlbfs_internal.h" +#include "libhugetlbfs_debug.h" + +static struct kernel_version running_kernel_version; + +/* This mask should always be 32 bits, regardless of the platform word size */ +static unsigned int feature_mask; + +static struct feature kernel_features[] = { + [HUGETLB_FEATURE_PRIVATE_RESV] = { + .name = "private_reservations", + .required_version = "2.6.27-rc1", + }, + [HUGETLB_FEATURE_SAFE_NORESERVE] = { + .name = "noreserve_safe", + .required_version = "2.6.34", + }, + [HUGETLB_FEATURE_MAP_HUGETLB] = { + .name = "map_hugetlb", + .required_version = "2.6.32", + } +}; + +static void debug_kernel_version(void) +{ + struct kernel_version *ver = &running_kernel_version; + + INFO("Parsed kernel version: [%u] . [%u] . 
[%u] ", + ver->major, ver->minor, ver->release); + if (ver->post) + INFO_CONT(" [post-release: %u]\n", ver->post); + else if (ver->pre) + INFO_CONT(" [pre-release: %u]\n", ver->pre); + else + INFO_CONT("\n"); +} + +static int str_to_ver(const char *str, struct kernel_version *ver) +{ + int err; + int nr_chars; + char extra[4]; + + /* Clear out version struct */ + ver->major = ver->minor = ver->release = ver->post = ver->pre = 0; + + /* The kernel always starts x.y.z */ + err = sscanf(str, "%u.%u.%u%n", &ver->major, &ver->minor, &ver->release, + &nr_chars); + /* + * The sscanf man page says that %n may or may not affect the return + * value so make sure it is at least 3 to cover the three kernel + * version variables and assume nr_chars will be correctly assigned. + */ + if (err < 3) { + ERROR("Unable to determine base kernel version: %s\n", + strerror(errno)); + return -1; + } + + /* Advance the str by the number of characters indicated by sscanf */ + str += nr_chars; + + /* Try to match a post/stable version */ + err = sscanf(str, ".%u", &ver->post); + if (err == 1) + return 0; + + /* Try to match a preN/rcN version */ + err = sscanf(str, "-%3[^0-9]%u", extra, &ver->pre); + if (err != 2 || (strcmp(extra, "pre") != 0 && strcmp(extra, "rc") != 0)) + ver->pre = 0; + + /* + * For now we ignore any extraversions besides pre and post versions + * and treat them as equal to the base version. + */ + return 0; +} + +static int int_cmp(int a, int b) +{ + if (a < b) + return -1; + if (a > b) + return 1; + else + return 0; +} + +/* + * Pre-release kernels have the following compare rules: + * X.Y.(Z - 1) < X.Y.Z-rcN < X.Y.X + * This order can be enforced by simply decrementing the release (for + * comparison purposes) when there is a pre/rc modifier in effect. 
+ */ +static int ver_cmp_release(struct kernel_version *ver) +{ + if (ver->pre) + return ver->release - 1; + else + return ver->release; +} + +static int ver_cmp(struct kernel_version *a, struct kernel_version *b) +{ + int ret, a_release, b_release; + + if ((ret = int_cmp(a->major, b->major)) != 0) + return ret; + + if ((ret = int_cmp(a->minor, b->minor)) != 0) + return ret; + + a_release = ver_cmp_release(a); + b_release = ver_cmp_release(b); + if ((ret = int_cmp(a_release, b_release)) != 0) + return ret; + + if ((ret = int_cmp(a->post, b->post)) != 0) + return ret; + + if ((ret = int_cmp(a->pre, b->pre)) != 0) + return ret; + + /* We ignore forks (such as -mm and -mjb) */ + return 0; +} + +int test_compare_kver(const char *a, const char *b) +{ + struct kernel_version ka, kb; + + if (str_to_ver(a, &ka) < 0) + return -EINVAL; + if (str_to_ver(b, &kb) < 0) + return -EINVAL; + return ver_cmp(&ka, &kb); +} + +int hugetlbfs_test_feature(int feature_code) +{ + if (feature_code >= HUGETLB_FEATURE_NR) { + ERROR("hugetlbfs_test_feature: invalid feature code\n"); + return -EINVAL; + } + return feature_mask & (1 << feature_code); +} + +static void print_valid_features(void) +{ + int i; + + ERROR("HUGETLB_FEATURES=\"<feature>[,<feature>] ...\"\n"); + ERROR_CONT("Valid features:\n"); + for (i = 0; i < HUGETLB_FEATURE_NR; i++) + ERROR_CONT("\t%s, no_%s\n", kernel_features[i].name, + kernel_features[i].name); +} + +static int check_features_env_valid(const char *env) +{ + const char *pos = env; + int i; + + while (pos && *pos != '\0') { + int match = 0; + char *next; + + if (*pos == ',') + pos++; + next = strchrnul(pos, ','); + if (strncmp(pos, "no_", 3) == 0) + pos += 3; + + for (i = 0; i < HUGETLB_FEATURE_NR; i++) { + char *name = kernel_features[i].name; + if (strncmp(pos, name, next - pos) == 0) { + match = 1; + break; + } + } + if (!match) { + print_valid_features(); + return -1; + } + pos = next; + } + return 0; +} + +void setup_features() +{ + struct utsname u; + int i; + 
+ if (uname(&u)) { + ERROR("Getting kernel version failed: %s\n", strerror(errno)); + return; + } + + str_to_ver(u.release, &running_kernel_version); + debug_kernel_version(); + + /* Check if the user has overrided any features */ + if (__hugetlb_opts.features && + check_features_env_valid(__hugetlb_opts.features) == -1) { + ERROR("HUGETLB_FEATURES was invalid -- ignoring.\n"); + __hugetlb_opts.features = NULL; + } + + for (i = 0; i < HUGETLB_FEATURE_NR; i++) { + struct kernel_version ver; + char *name = kernel_features[i].name; + char *pos; + + str_to_ver(kernel_features[i].required_version, &ver); + + /* Has the user overridden feature detection? */ + if (__hugetlb_opts.features && + (pos = strstr(__hugetlb_opts.features, name))) { + INFO("Overriding feature %s: ", name); + /* If feature is preceeded by 'no_' then turn it off */ + if (((pos - 3) >= __hugetlb_opts.features) && + !strncmp(pos - 3, "no_", 3)) + INFO_CONT("no\n"); + else { + INFO_CONT("yes\n"); + feature_mask |= (1UL << i); + } + continue; + } + + /* Is the running kernel version newer? */ + if (ver_cmp(&running_kernel_version, &ver) >= 0) { + INFO("Feature %s is present in this kernel\n", + kernel_features[i].name); + feature_mask |= (1UL << i); + } + } +} diff --git a/default/libhugetlbfs/libhugetlbfs/kernel-features.h b/default/libhugetlbfs/libhugetlbfs/kernel-features.h new file mode 100644 index 0000000..e1b6ca9 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/kernel-features.h @@ -0,0 +1,30 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +struct kernel_version { + unsigned int major; + unsigned int minor; + unsigned int release; + unsigned int post; + unsigned int pre; +}; + +struct feature { + char *name; + char *required_version; +}; diff --git a/default/libhugetlbfs/libhugetlbfs/ld.hugetlbfs b/default/libhugetlbfs/libhugetlbfs/ld.hugetlbfs new file mode 100755 index 0000000..d102a56 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ld.hugetlbfs @@ -0,0 +1,84 @@ +#! /bin/bash + +# Paranoid check to make sure we don't reinvoke ourselves, effectively +# making a fork()bomb +if [ -n "$LD_HUGETLBFS_RECURSION" ]; then + exit 99 +fi +export LD_HUGETLBFS_RECURSION=1 + +### SET DEFAULT LDSCRIPT PATH HERE ### +if [ -z "$HUGETLB_LDSCRIPT_PATH" ]; then + # Assume this script is running from the libhugetlbfs source tree, + # and look for the ldscripts accordingly + HUGETLB_LDSCRIPT_PATH=$(dirname $(readlink $0))/ldscripts +fi + +# Try to figure out what's the underlying linker to invoke +if [ -z "$LD" ]; then + for x in $(which -a ld); do + if [ "$x" != "$0" ]; then + LD="$x" + break + fi + done +fi + +i=0 +while [ -n "$1" ]; do + arg="$1" + case "$arg" in + -m*) + EMU="${arg#-m}" + args[$i]="$arg" + i=$[i+1] + if [ -z "$EMU" ]; then + shift + EMU="$1" + args[$i]="$1" + i=$[i+1] + fi + ;; + --hugetlbfs-link=*) + if [ -z "$HUGETLB_DEPRECATED_LINK" ]; then + echo -n "ld.hugetlbfs: --hugetlbfs-link is deprecated. " 1>&2 + echo "Migrate to --hugetlbfs-align." 
1>&2 + fi + HTLB_LINK="${arg#--hugetlbfs-link=}" + ;; + --hugetlbfs-script-path=*) + HUGETLB_LDSCRIPT_PATH="${arg#--hugetlbfs-script-path=}" + ;; + --hugetlbfs-align) + HTLB_ALIGN="slice" + ;; + --) + args=("${args[@]}" "$@") + break + ;; + *) + args[$i]="$arg" + i=$[i+1] + ;; + esac + shift +done + +if [ -n "$HTLB_LINK" ]; then + HTLB_ALIGN="" # --hugetlbfs-link overrides --hugetlbfs-align + LDSCRIPT="$EMU.x$HTLB_LINK" + HTLBOPTS="-T${HUGETLB_LDSCRIPT_PATH}/${LDSCRIPT}" +fi + +MB=$((1024*1024)) +case "$EMU" in +elf32ppclinux|elf64ppc) HPAGE_SIZE=$((16*$MB)) SLICE_SIZE=$((256*$MB)) ;; +elf_i386|elf_x86_64) HPAGE_SIZE=$((4*$MB)) SLICE_SIZE=$HPAGE_SIZE ;; +esac + +if [ "$HTLB_ALIGN" == "slice" ]; then + HTLBOPTS="-zcommon-page-size=$SLICE_SIZE -zmax-page-size=$SLICE_SIZE" + HTLBOPTS="$HTLBOPTS -lhugetlbfs" +fi + +${LD} "${args[@]}" ${HTLBOPTS} diff --git a/default/libhugetlbfs/libhugetlbfs/ldscripts/elf32ppclinux.xB b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf32ppclinux.xB new file mode 100644 index 0000000..28ad88d --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf32ppclinux.xB @@ -0,0 +1,254 @@ +/* Link script for normal executables with BSS in hugepages */ +OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", + "elf32-powerpc") +OUTPUT_ARCH(powerpc:common) +ENTRY(_start) +SEARCH_DIR("/usr/powerpc-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + text PT_LOAD FILEHDR PHDRS ; + data PT_LOAD ; + htlb PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. */ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x10000000; . 
= 0x10000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :text :interp + .note.SuSE : { *(.note.SuSE) } :text :note + .note.ABI-tag : { *(.note.ABI-tag) } :text :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :text :note + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } :text + .dynstr : { *(.dynstr) } :text + .gnu.version : { *(.gnu.version) } :text + .gnu.version_d : { *(.gnu.version_d) } :text + .gnu.version_r : { *(.gnu.version_r) } :text + .rel.init : { *(.rel.init) } :text + .rela.init : { *(.rela.init) } :text + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :text + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :text + .rel.fini : { *(.rel.fini) } :text + .rela.fini : { *(.rela.fini) } :text + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :text + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :text + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :text + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :text + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :text + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :text + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :text + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :text + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :text + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :text + .rel.ctors : { *(.rel.ctors) } :text + .rela.ctors : { *(.rela.ctors) } :text + .rel.dtors : { *(.rel.dtors) } :text + .rela.dtors : { *(.rela.dtors) } :text + .rel.got : { *(.rel.got) } :text + .rela.got : { *(.rela.got) } :text + .rela.got1 : { *(.rela.got1) } :text + .rela.got2 : { *(.rela.got2) } :text + .rel.sdata : { *(.rel.sdata .rel.sdata.* .rel.gnu.linkonce.s.*) } :text + .rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) } :text + .rel.sbss : { *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*) } 
:text + .rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) } :text + .rel.sdata2 : { *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*) } :text + .rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) } :text + .rel.sbss2 : { *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*) } :text + .rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) } :text + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :text + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :text + .rel.plt : { *(.rel.plt) } :text + .rela.plt : { *(.rela.plt) } :text + .init : + { + KEEP (*(.init)) + } :text =0 + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.glink) + } :text =0 + .fini : + { + KEEP (*(.fini)) + } :text =0 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :text + .rodata1 : { *(.rodata1) } :text + .sdata2 : + { + PROVIDE (_SDA2_BASE_ = 32768); + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } :text + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } :text + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr +/* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :text */ +/* .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } :text */ + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN (0x10000) - ((0x10000 - .) & (0x10000 - 1)); . 
= DATA_SEGMENT_ALIGN (0x10000, 0x1000); + /* Exception handling */ + .eh_frame : /*ONLY_IF_RW*/ { KEEP (*(.eh_frame)) } :data + .gcc_except_table : /*ONLY_IF_RW*/ { *(.gcc_except_table .gcc_except_table.*) } :data + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :data + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :data + .preinit_array : + { + PROVIDE /*_HIDDEN*/ (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE /*_HIDDEN*/ (__preinit_array_end = .); + } :data + .init_array : + { + PROVIDE /*_HIDDEN*/ (__init_array_start = .); + KEEP (*(SORT(.init_array.*))) + KEEP (*(.init_array)) + PROVIDE /*_HIDDEN*/ (__init_array_end = .); + } :data + .fini_array : + { + PROVIDE /*_HIDDEN*/ (__fini_array_start = .); + KEEP (*(.fini_array)) + KEEP (*(SORT(.fini_array.*))) + PROVIDE /*_HIDDEN*/ (__fini_array_end = .); + } :data + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :data + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :data + .jcr : { KEEP (*(.jcr)) } :data + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :data + .got1 : { *(.got1) } :data + .got2 : { *(.got2) } :data + .dynamic : { *(.dynamic) } :dynamic :data +/* .got : SPECIAL { *(.got) } :data*/ +/* . 
= DATA_SEGMENT_RELRO_END (0, .);*/ +/* .plt : SPECIAL { *(.plt) } :data*/ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :data + .data1 : { *(.data1) } :data + .got : /*SPECIAL*/ { *(.got) } :data + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + PROVIDE (_SDA_BASE_ = 32768); + *(.sdata .sdata.* .gnu.linkonce.s.*) + } :data + _edata = .; PROVIDE (edata = .); + .plt : /*SPECIAL*/ { *(.plt) } :data + . = ALIGN(32 / 8); + . = ALIGN(32 / 8); + . = DATA_SEGMENT_END (.); + /* Hugepage area */ + /* Saving hugepages is more important than saving executable size, so + * we don't attempt to maintain congruence here */ + . = ALIGN(0x10000000); /* Align to next 256MB segment */ + /* HACK: workaround fact that kernel may not cope with segments with zero + * filesize */ + .hugetlb.data : { LONG(1) } :htlb + __bss_start = .; + .sbss : + { + PROVIDE (__sbss_start = .); PROVIDE (___sbss_start = .); + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + PROVIDE (__sbss_end = .); PROVIDE (___sbss_end = .); + } :htlb + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* + * Align here to ensure that the .bss section occupies space up to + * _end. Additionally (for huge pages) align to a segment boundary. + * This ensures that no normal page mappings will be created in this + * segment (after the bss) which could interfere with remapping. + */ + . = ALIGN(256*1024*1024); + } :htlb + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. 
+ Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.fixup) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/default/libhugetlbfs/libhugetlbfs/ldscripts/elf32ppclinux.xBDT b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf32ppclinux.xBDT new file mode 100644 index 0000000..497882b --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf32ppclinux.xBDT @@ -0,0 +1,245 @@ +/* Linker script for normal executables with text data and BSS in hugepages */ +OUTPUT_FORMAT("elf32-powerpc", "elf32-powerpc", + "elf32-powerpc") +OUTPUT_ARCH(powerpc:common) +ENTRY(_start) +SEARCH_DIR("/usr/powerpc-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + htext PT_LOAD FILEHDR PHDRS FLAGS (0x00100005); + hdata PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. 
*/ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x10000000; . = 0x10000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :htext :interp + .note.SuSE : { *(.note.SuSE) } :htext :note + .note.ABI-tag : { *(.note.ABI-tag) } :htext :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :htext :note + .hash : { *(.hash) } :htext + .dynsym : { *(.dynsym) } :htext + .dynstr : { *(.dynstr) } :htext + .gnu.version : { *(.gnu.version) } :htext + .gnu.version_d : { *(.gnu.version_d) } :htext + .gnu.version_r : { *(.gnu.version_r) } :htext + .rel.init : { *(.rel.init) } :htext + .rela.init : { *(.rela.init) } :htext + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :htext + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :htext + .rel.fini : { *(.rel.fini) } :htext + .rela.fini : { *(.rela.fini) } :htext + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :htext + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :htext + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :htext + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :htext + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :htext + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :htext + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :htext + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :htext + .rel.ctors : { *(.rel.ctors) } :htext + .rela.ctors : { *(.rela.ctors) } :htext + .rel.dtors : { *(.rel.dtors) } :htext + .rela.dtors : { *(.rela.dtors) } :htext + .rel.got : { *(.rel.got) } :htext + .rela.got : { *(.rela.got) } :htext + .rela.got1 : { *(.rela.got1) } :htext + .rela.got2 : { *(.rela.got2) } :htext + .rel.sdata : { *(.rel.sdata 
.rel.sdata.* .rel.gnu.linkonce.s.*) } :htext + .rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) } :htext + .rel.sbss : { *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*) } :htext + .rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) } :htext + .rel.sdata2 : { *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*) } :htext + .rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) } :htext + .rel.sbss2 : { *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*) } :htext + .rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) } :htext + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :htext + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :htext + .rel.plt : { *(.rel.plt) } :htext + .rela.plt : { *(.rela.plt) } :htext + .init : + { + KEEP (*(.init)) + } :htext =0 + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.glink) + } :htext =0 + .fini : + { + KEEP (*(.fini)) + } :htext =0 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :htext + .rodata1 : { *(.rodata1) } :htext + .sdata2 : + { + PROVIDE (_SDA2_BASE_ = 32768); + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } :htext + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } :htext + .eh_frame_hdr : { *(.eh_frame_hdr) } :htext :eh_frame_hdr +/* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :htext */ +/* .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } :htext */ + + /* We don't maintain address congruence here, because saving + * hugepages is more important than saving executable size. */ + /* Just move to the very next hugepage, rather than using a guard + * page, because for ppc32 binaries we can't separate the text and + * PLT by >32MB */ + . 
= ALIGN (0x1000000); + /* Exception handling */ + .eh_frame : /*ONLY_IF_RW*/ { KEEP (*(.eh_frame)) } :hdata + .gcc_except_table : /*ONLY_IF_RW*/ { *(.gcc_except_table .gcc_except_table.*) } :hdata + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :hdata + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :hdata + .preinit_array : + { + PROVIDE /*_HIDDEN*/ (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE /*_HIDDEN*/ (__preinit_array_end = .); + } :hdata + .init_array : + { + PROVIDE /*_HIDDEN*/ (__init_array_start = .); + KEEP (*(SORT(.init_array.*))) + KEEP (*(.init_array)) + PROVIDE /*_HIDDEN*/ (__init_array_end = .); + } :hdata + .fini_array : + { + PROVIDE /*_HIDDEN*/ (__fini_array_start = .); + KEEP (*(.fini_array)) + KEEP (*(SORT(.fini_array.*))) + PROVIDE /*_HIDDEN*/ (__fini_array_end = .); + } :hdata + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :hdata + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :hdata + .jcr : { KEEP (*(.jcr)) } :hdata + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :hdata + .got1 : { *(.got1) } :hdata + .got2 : { *(.got2) } :hdata + .dynamic : { *(.dynamic) } :dynamic :hdata + .got : { *(.got.plt .got) } :hdata +/* . 
= DATA_SEGMENT_RELRO_END (0, .); */ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :hdata + .data1 : { *(.data1) } :hdata + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + PROVIDE (_SDA_BASE_ = 32768); + *(.sdata .sdata.* .gnu.linkonce.s.*) + } :hdata + _edata = .; PROVIDE (edata = .); + __bss_start = .; + .sbss : + { + PROVIDE (__sbss_start = .); PROVIDE (___sbss_start = .); + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + PROVIDE (__sbss_end = .); PROVIDE (___sbss_end = .); + } :hdata + .plt : { *(.plt) } :hdata + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* + * Align here to ensure that the .bss section occupies space up to + * _end. Additionally (for huge pages) align to a segment boundary. + * This ensures that no normal page mappings will be created in this + * segment (after the bss) which could interfere with remapping. + */ + . = ALIGN(256*1024*1024); + } :hdata + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.fixup) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/default/libhugetlbfs/libhugetlbfs/ldscripts/elf64ppc.xB b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf64ppc.xB new file mode 100644 index 0000000..1a9c1ab --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf64ppc.xB @@ -0,0 +1,245 @@ +/* Linker script for normal executables with BSS in hugepages */ +OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", + "elf64-powerpc") +OUTPUT_ARCH(powerpc:common64) +ENTRY(_start) +SEARCH_DIR("/usr/powerpc64-linux-gnu/lib64"); SEARCH_DIR("/usr/local/lib64"); SEARCH_DIR("/lib64"); SEARCH_DIR("/usr/lib64"); SEARCH_DIR("/usr/powerpc64-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + text PT_LOAD FILEHDR PHDRS ; + data PT_LOAD ; + htlb PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. 
*/ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x10000000; . = 0x10000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :text :interp + .note.SuSE : { *(.note.SuSE) } :text :note + .note.ABI-tag : { *(.note.ABI-tag) } :text :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :text :note + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + .rel.init : { *(.rel.init) } + .rela.init : { *(.rela.init) } + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } + .rel.fini : { *(.rel.fini) } + .rela.fini : { *(.rela.fini) } + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } + .rel.ctors : { *(.rel.ctors) } + .rela.ctors : { *(.rela.ctors) } + .rel.dtors : { *(.rel.dtors) } + .rela.dtors : { *(.rela.dtors) } + .rel.got : { *(.rel.got) } + .rela.got : { *(.rela.got) } + .rela.toc : { *(.rela.toc) } + .rel.sdata : { *(.rel.sdata .rel.sdata.* .rel.gnu.linkonce.s.*) } + .rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) } + .rel.sbss : { *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*) } + .rela.sbss : { *(.rela.sbss .rela.sbss.* 
.rela.gnu.linkonce.sb.*) } + .rel.sdata2 : { *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*) } + .rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) } + .rel.sbss2 : { *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*) } + .rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) } + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } + .rel.plt : { *(.rel.plt) } + .rela.plt : { *(.rela.plt) } + .rela.tocbss : { *(.rela.tocbss) } + .init : + { + KEEP (*(.init)) + } =0x60000000 + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.sfpr .glink) + } =0x60000000 + .fini : + { + KEEP (*(.fini)) + } =0x60000000 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + .sdata2 : + { + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr +/* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) }*/ +/* .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) }*/ + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN (0x10000) - ((0x10000 - .) & (0x10000 - 1)); . 
= DATA_SEGMENT_ALIGN (0x10000, 0x1000); + /* Exception handling */ + .eh_frame : /*ONLY_IF_RW*/ { KEEP (*(.eh_frame)) } :data + .gcc_except_table : /*ONLY_IF_RW*/ { *(.gcc_except_table .gcc_except_table.*) } + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } + .preinit_array : + { + PROVIDE /*_HIDDEN*/ (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE /*_HIDDEN*/ (__preinit_array_end = .); + } + .init_array : + { + PROVIDE /*_HIDDEN*/ (__init_array_start = .); + KEEP (*(SORT(.init_array.*))) + KEEP (*(.init_array)) + PROVIDE /*_HIDDEN*/ (__init_array_end = .); + } + .fini_array : + { + PROVIDE /*_HIDDEN*/ (__fini_array_start = .); + KEEP (*(.fini_array)) + KEEP (*(SORT(.fini_array.*))) + PROVIDE /*_HIDDEN*/ (__fini_array_end = .); + } + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } + .jcr : { KEEP (*(.jcr)) } + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } + .dynamic : { *(.dynamic) } :dynamic :data +/* . 
= DATA_SEGMENT_RELRO_END (0, .);*/ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } + .data1 : { *(.data1) } + .toc1 ALIGN(8) : { *(.toc1) } + .opd ALIGN(8) : { KEEP (*(.opd)) } + .got ALIGN(8) : { *(.got .toc) } + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + *(.sdata .sdata.* .gnu.linkonce.s.*) + } + _edata = .; PROVIDE (edata = .); + .plt : { *(.plt) } + . = ALIGN(64 / 8); + . = ALIGN(64 / 8); + . = DATA_SEGMENT_END (.); + /* Hugepage area: + * Saving hugepages is more important than saving executable size, so + * we don't attempt to maintain congruence here. + * In order to map hugepages into the address space, we must advance the + * location counter to a segment boundary. If the address is < 4G, the + * next segment will be on a 256M boundary. For higher areas, we have a + * 1TB granularity. */ + . = (. < 0x100000000) ? ALIGN(0x10000000) : ALIGN(0x10000000000); + /* HACK: workaround fact that kernel may not cope with segments with zero + * filesize */ + .hugetlb.data : { LONG(1) } :htlb + __bss_start = .; + .tocbss ALIGN(8) : { *(.tocbss)} :htlb + .sbss : + { + + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + } + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + } + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/default/libhugetlbfs/libhugetlbfs/ldscripts/elf64ppc.xBDT b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf64ppc.xBDT new file mode 100644 index 0000000..5477294 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf64ppc.xBDT @@ -0,0 +1,241 @@ +/* Linker script for normal executables with text, data and BSS in hugepages */ +OUTPUT_FORMAT("elf64-powerpc", "elf64-powerpc", + "elf64-powerpc") +OUTPUT_ARCH(powerpc:common64) +ENTRY(_start) +SEARCH_DIR("/usr/powerpc64-linux-gnu/lib64"); SEARCH_DIR("/usr/local/lib64"); SEARCH_DIR("/lib64"); SEARCH_DIR("/usr/lib64"); SEARCH_DIR("/usr/powerpc64-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT( -lhugetlbfs ); +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + htext PT_LOAD FILEHDR PHDRS FLAGS (0x00100005); + hdata PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. 
*/ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x10000000; . = 0x10000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :interp :htext + .note.SuSE : { *(.note.SuSE) } :htext :note + .note.ABI-tag : { *(.note.ABI-tag) } :htext :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :htext :note + .hash : { *(.hash) } :htext + .dynsym : { *(.dynsym) } :htext + .dynstr : { *(.dynstr) } :htext + .gnu.version : { *(.gnu.version) } :htext + .gnu.version_d : { *(.gnu.version_d) } :htext + .gnu.version_r : { *(.gnu.version_r) } :htext + .rel.init : { *(.rel.init) } :htext + .rela.init : { *(.rela.init) } :htext + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :htext + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :htext + .rel.fini : { *(.rel.fini) } :htext + .rela.fini : { *(.rela.fini) } :htext + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :htext + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :htext + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :htext + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :htext + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :htext + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :htext + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :htext + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :htext + .rel.ctors : { *(.rel.ctors) } :htext + .rela.ctors : { *(.rela.ctors) } :htext + .rel.dtors : { *(.rel.dtors) } :htext + .rela.dtors : { *(.rela.dtors) } :htext + .rel.got : { *(.rel.got) } :htext + .rela.got : { *(.rela.got) } :htext + .rela.toc : { *(.rela.toc) } :htext + .rel.sdata : { *(.rel.sdata .rel.sdata.* .rel.gnu.linkonce.s.*) } 
:htext + .rela.sdata : { *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) } :htext + .rel.sbss : { *(.rel.sbss .rel.sbss.* .rel.gnu.linkonce.sb.*) } :htext + .rela.sbss : { *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) } :htext + .rel.sdata2 : { *(.rel.sdata2 .rel.sdata2.* .rel.gnu.linkonce.s2.*) } :htext + .rela.sdata2 : { *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) } :htext + .rel.sbss2 : { *(.rel.sbss2 .rel.sbss2.* .rel.gnu.linkonce.sb2.*) } :htext + .rela.sbss2 : { *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) } :htext + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :htext + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :htext + .rel.plt : { *(.rel.plt) } :htext + .rela.plt : { *(.rela.plt) } :htext + .rela.tocbss : { *(.rela.tocbss) } :htext + .init : + { + KEEP (*(.init)) + } :htext =0x60000000 + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + *(.sfpr .glink) + } :htext =0x60000000 + .fini : + { + KEEP (*(.fini)) + } :htext =0x60000000 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :htext + .rodata1 : { *(.rodata1) } :htext + .sdata2 : + { + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } :htext + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } :htext + .eh_frame_hdr : { *(.eh_frame_hdr) } :htext :eh_frame_hdr +/* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :htext */ +/* .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } :htext */ + + /* We don't maintain address congruence here, because saving + * hugepages is more important than saving executable size. */ + . = ALIGN (. 
+ 0x1000000, 0x1000000); /* Align to next 16MB hugepage */ + /* Exception handling */ + .eh_frame : /*ONLY_IF_RW*/ { KEEP (*(.eh_frame)) } :hdata + .gcc_except_table : /*ONLY_IF_RW*/ { *(.gcc_except_table .gcc_except_table.*) } :hdata + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :hdata + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :hdata + .preinit_array : + { + PROVIDE /*_HIDDEN*/ (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE /*_HIDDEN*/ (__preinit_array_end = .); + } :hdata + .init_array : + { + PROVIDE /*_HIDDEN*/ (__init_array_start = .); + KEEP (*(SORT(.init_array.*))) + KEEP (*(.init_array)) + PROVIDE /*_HIDDEN*/ (__init_array_end = .); + } :hdata + .fini_array : + { + PROVIDE /*_HIDDEN*/ (__fini_array_start = .); + KEEP (*(.fini_array)) + KEEP (*(SORT(.fini_array.*))) + PROVIDE /*_HIDDEN*/ (__fini_array_end = .); + } :hdata + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :hdata + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :hdata + .jcr : { KEEP (*(.jcr)) } :hdata + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :hdata + .dynamic : { *(.dynamic) } :dynamic :hdata +/* . 
= DATA_SEGMENT_RELRO_END (0, .);*/ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :hdata + .data1 : { *(.data1)} :hdata + .toc1 ALIGN(8) : { *(.toc1) } :hdata + .opd ALIGN(8) : { KEEP (*(.opd)) } :hdata + .got ALIGN(8) : { *(.got .toc) } :hdata + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + *(.sdata .sdata.* .gnu.linkonce.s.*) + } :hdata + _edata = .; PROVIDE (edata = .); + __bss_start = .; + .tocbss ALIGN(8) : { *(.tocbss)} :hdata + .sbss : + { + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + } :hdata + .plt : { *(.plt) } :hdata + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* + * Align here to ensure that the .bss section occupies space up to + * _end. Additionally (for huge pages) align to a segment boundary. + * This ensures that no normal page mappings will be created in this + * segment (after the bss) which could interfere with remapping. + * + * XXX: This ALIGN will need to be extended to handle the case where + * ends above 1T -- in which case the alignment should be 1T. + */ + . = ALIGN(256*1024*1024); + } :hdata + _end = .; + PROVIDE (end = .); +/*. = DATA_SEGMENT_END (.);*/ + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_i386.xB b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_i386.xB new file mode 100644 index 0000000..43fe51c --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_i386.xB @@ -0,0 +1,200 @@ +/* Linker script for normal executables with BSS in hugepages */ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", + "elf32-i386") +OUTPUT_ARCH(i386) +ENTRY(_start) +SEARCH_DIR("/usr/i486-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +/* Do we need any of these for elf? + __DYNAMIC = 0; */ +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + text PT_LOAD FILEHDR PHDRS ; + data PT_LOAD ; + htlb PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. 
*/ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x08048000; . = 0x08048000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :text :interp + .note.SuSE : { *(.note.SuSE) } :text :note + .note.ABI-tag : { *(.note.ABI-tag) } :text :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :text :note + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + .rel.init : { *(.rel.init) } + .rela.init : { *(.rela.init) } + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } + .rel.fini : { *(.rel.fini) } + .rela.fini : { *(.rela.fini) } + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } + .rel.ctors : { *(.rel.ctors) } + .rela.ctors : { *(.rela.ctors) } + .rel.dtors : { *(.rel.dtors) } + .rela.dtors : { *(.rela.dtors) } + .rel.got : { *(.rel.got) } + .rela.got : { *(.rela.got) } + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } + .rel.plt : { *(.rel.plt) } + .rela.plt : { *(.rela.plt) } + .init : + { + KEEP (*(.init)) + } =0x90909090 + .plt : { *(.plt) } + .text : + { + *(.text .stub .text.* 
.gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + } =0x90909090 + .fini : + { + KEEP (*(.fini)) + } =0x90909090 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + .eh_frame_hdr : { *(.eh_frame_hdr) } :data :eh_frame_hdr + /* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } + .gcc_except_table : ONLY_IF_RO { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } */ + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN (0x1000) - ((0x1000 - .) & (0x1000 - 1)); . = DATA_SEGMENT_ALIGN (0x1000, 0x1000); + /* Exception handling */ + .eh_frame : /* ONLY_IF_RW */ { KEEP (*(.eh_frame)) } :data + .gcc_except_table : /* ONLY_IF_RW */ { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . = ALIGN(32 / 8); + PROVIDE (__preinit_array_start = .); + .preinit_array : { KEEP (*(.preinit_array)) } + PROVIDE (__preinit_array_end = .); + PROVIDE (__init_array_start = .); + .init_array : { KEEP (*(.init_array)) } + PROVIDE (__init_array_end = .); + PROVIDE (__fini_array_start = .); + .fini_array : { KEEP (*(.fini_array)) } + PROVIDE (__fini_array_end = .); + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. 
The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } + .jcr : { KEEP (*(.jcr)) } + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } + .dynamic : { *(.dynamic) } :dynamic :data + .got : { *(.got.plt) *(.got) } + /*. = DATA_SEGMENT_RELRO_END (12, .);*/ + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } + .data1 : { *(.data1) } + _edata = .; + PROVIDE (edata = .); + . = ALIGN(32 / 8); + . = DATA_SEGMENT_END (.); + /* Hugepage area */ + . = ALIGN(0x1000000); /* Align to 16MB (4MB hugepage size, plus some slack in case of larger hugepages in future */ + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + } :htlb + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_i386.xBDT b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_i386.xBDT new file mode 100644 index 0000000..d72aebe --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_i386.xBDT @@ -0,0 +1,198 @@ +/* Linker script for normal executables with text, data and BSS in hugepages */ +OUTPUT_FORMAT("elf32-i386", "elf32-i386", + "elf32-i386") +OUTPUT_ARCH(i386) +ENTRY(_start) +SEARCH_DIR("/usr/i486-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +/* Do we need any of these for elf? + __DYNAMIC = 0; */ +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + htext PT_LOAD FILEHDR PHDRS FLAGS (0x00100005); + hdata PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. 
*/ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + /* Different from the normal origin addres, because we need to make + * it hugepage aligned */ + __executable_start = 0x08000000; . = 0x08000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :htext :interp + .note.SuSE : { *(.note.SuSE) } :htext :note + .note.ABI-tag : { *(.note.ABI-tag) } :htext :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :htext :note + .hash : { *(.hash) } :htext + .dynsym : { *(.dynsym) } :htext + .dynstr : { *(.dynstr) } :htext + .gnu.version : { *(.gnu.version) } :htext + .gnu.version_d : { *(.gnu.version_d) } :htext + .gnu.version_r : { *(.gnu.version_r) } :htext + .rel.init : { *(.rel.init) } :htext + .rela.init : { *(.rela.init) } :htext + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :htext + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :htext + .rel.fini : { *(.rel.fini) } :htext + .rela.fini : { *(.rela.fini) } :htext + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :htext + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :htext + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :htext + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :htext + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :htext + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :htext + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :htext + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :htext + .rel.ctors : { *(.rel.ctors) } :htext + .rela.ctors : { *(.rela.ctors) } :htext + .rel.dtors : { *(.rel.dtors) } :htext + .rela.dtors : { *(.rela.dtors) } :htext + .rel.got : { *(.rel.got) } :htext + .rela.got : { *(.rela.got) } :htext + .rel.bss : 
{ *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :htext + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :htext + .rel.plt : { *(.rel.plt) } :htext + .rela.plt : { *(.rela.plt) } :htext + .init : + { + KEEP (*(.init)) + } :htext =0x90909090 + .plt : { *(.plt) } + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + } :htext =0x90909090 + .fini : + { + KEEP (*(.fini)) + } :htext =0x90909090 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :htext + .rodata1 : { *(.rodata1) } :htext + .eh_frame_hdr : { *(.eh_frame_hdr) } :htext :eh_frame_hdr + /* .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :htext + .gcc_except_table : ONLY_IF_RO { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :htext */ + + /* We don't maintain address congruence here, because saving + * hugepages is more important than saving executable size. */ + . = ALIGN (0x1000000); /* Align to 16MB (4MB hugepage size, plus some slack in case of larger hugepages in future */ + /* Exception handling */ + .eh_frame : /* ONLY_IF_RW */ { KEEP (*(.eh_frame)) } :hdata + .gcc_except_table : /* ONLY_IF_RW */ { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :hdata + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :hdata + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :hdata + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . 
= ALIGN(32 / 8); + PROVIDE (__preinit_array_start = .); + .preinit_array : { KEEP (*(.preinit_array)) } :hdata + PROVIDE (__preinit_array_end = .); + PROVIDE (__init_array_start = .); + .init_array : { KEEP (*(.init_array)) } :hdata + PROVIDE (__init_array_end = .); + PROVIDE (__fini_array_start = .); + .fini_array : { KEEP (*(.fini_array)) } :hdata + PROVIDE (__fini_array_end = .); + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :hdata + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :hdata + .jcr : { KEEP (*(.jcr)) } :hdata + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :hdata + .dynamic : { *(.dynamic) } :dynamic :hdata + .got : { *(.got.plt) *(.got) } :hdata + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :hdata + .data1 : { *(.data1) } :hdata + _edata = .; + PROVIDE (edata = .); + . = ALIGN(32 / 8); + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + } :hdata + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. 
*/ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_x86_64.xB b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_x86_64.xB new file mode 100644 index 0000000..ed21a2c --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_x86_64.xB @@ -0,0 +1,202 @@ +/* Linker script for normal executables with BSS in hugepages */ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", + "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(_start) +SEARCH_DIR("/usr/x86_64-linux-gnu/lib64"); SEARCH_DIR("/usr/local/lib64"); SEARCH_DIR("/lib64"); SEARCH_DIR("/usr/lib64"); SEARCH_DIR("/usr/x86_64-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +/* Do we need any of these for elf? 
+ __DYNAMIC = 0; */ +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + text PT_LOAD FILEHDR PHDRS ; + data PT_LOAD ; + htlb PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. */ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + __executable_start = 0x400000; . = 0x400000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :text :interp + .note.SuSE : { *(.note.SuSE) } :text :note + .note.ABI-tag : { *(.note.ABI-tag) } :text :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :text :note + .hash : { *(.hash) } :text + .dynsym : { *(.dynsym) } :text + .dynstr : { *(.dynstr) } :text + .gnu.version : { *(.gnu.version) } :text + .gnu.version_d : { *(.gnu.version_d) } :text + .gnu.version_r : { *(.gnu.version_r) } :text + .rel.init : { *(.rel.init) } :text + .rela.init : { *(.rela.init) } :text + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :text + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :text + .rel.fini : { *(.rel.fini) } :text + .rela.fini : { *(.rela.fini) } :text + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :text + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :text + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :text + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } :text + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :text + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :text + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :text + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :text + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :text + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) 
} :text + .rel.ctors : { *(.rel.ctors) } :text + .rela.ctors : { *(.rela.ctors) } :text + .rel.dtors : { *(.rel.dtors) } :text + .rela.dtors : { *(.rela.dtors) } :text + .rel.got : { *(.rel.got) } :text + .rela.got : { *(.rela.got) } :text + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :text + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :text + .rel.plt : { *(.rel.plt) } :text + .rela.plt : { *(.rela.plt) } :text + .init : + { + KEEP (*(.init)) + } :text =0x90909090 + .plt : { *(.plt)} :text + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + } :text =0x90909090 + .fini : + { + KEEP (*(.fini)) + } :text =0x90909090 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :text + .rodata1 : { *(.rodata1) } :text + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :text + .gcc_except_table : ONLY_IF_RO { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :text + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = ALIGN (0x100000) - ((0x100000 - .) & (0x100000 - 1)); . = DATA_SEGMENT_ALIGN (0x100000, 0x1000); + /* Exception handling */ + .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) } :data + .gcc_except_table : ONLY_IF_RW { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :data + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :data + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :data + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . 
= ALIGN(64 / 8); + PROVIDE (__preinit_array_start = .); + .preinit_array : { KEEP (*(.preinit_array)) } :data + PROVIDE (__preinit_array_end = .); + PROVIDE (__init_array_start = .); + .init_array : { KEEP (*(.init_array)) } :data + PROVIDE (__init_array_end = .); + PROVIDE (__fini_array_start = .); + .fini_array : { KEEP (*(.fini_array)) } :data + PROVIDE (__fini_array_end = .); + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :data + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :data + .jcr : { KEEP (*(.jcr)) } :data + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :data + .dynamic : { *(.dynamic) } :dynamic :data + .got : { *(.got) } :data + . = DATA_SEGMENT_RELRO_END (24, .); + .got.plt : { *(.got.plt) } :data + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :data + .data1 : { *(.data1) } :data + _edata = .; + PROVIDE (edata = .); + __bss_start = .; + . = ALIGN(64 / 8); + . = DATA_SEGMENT_END (.); + /* Hugepage area */ + . 
= ALIGN(0x1000000); /* Align to 16MB (2MB hugepage size, plus some slack in case of larger hugepages in future */ + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + } :htlb + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_x86_64.xBDT b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_x86_64.xBDT new file mode 100644 index 0000000..1855202 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/ldscripts/elf_x86_64.xBDT @@ -0,0 +1,202 @@ +/* Linker script for normal executables with text data and BSS in hugepages */ +OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", + "elf64-x86-64") +OUTPUT_ARCH(i386:x86-64) +ENTRY(_start) 
+SEARCH_DIR("/usr/x86_64-linux-gnu/lib64"); SEARCH_DIR("/usr/local/lib64"); SEARCH_DIR("/lib64"); SEARCH_DIR("/usr/lib64"); SEARCH_DIR("/usr/x86_64-linux-gnu/lib"); SEARCH_DIR("/usr/local/lib"); SEARCH_DIR("/lib"); SEARCH_DIR("/usr/lib"); +INPUT(-lhugetlbfs); +/* Do we need any of these for elf? + __DYNAMIC = 0; */ +PHDRS +{ + headers PT_PHDR PHDRS ; + interp PT_INTERP ; + htext PT_LOAD FILEHDR PHDRS FLAGS (0x00100005); + hdata PT_LOAD FLAGS (0x00100007); + dynamic PT_DYNAMIC ; + note PT_NOTE ; + gnu_stack PT_GNU_STACK ; + /* this is the value of PT_GNU_EH_FRAME as defined in + usr/include/elf.h but binutils does not recognize that identifier + as it does other PT_ constants. */ + eh_frame_hdr 1685382480 FLAGS (0x00000004); +} +SECTIONS +{ + /* Read-only sections, merged into text segment: */ + /* Different from the normal origin address, because we make it 16MB + * aligned, in case of future larger hugepages */ + __executable_start = 0x1000000; . = 0x1000000 + SIZEOF_HEADERS; + .interp : { *(.interp) } :interp :htext + .hash : { *(.hash) } :htext + .note.SuSE : { *(.note.SuSE) } :htext :note + .note.ABI-tag : { *(.note.ABI-tag) } :htext :note + .note.gnu.build-id : { *(.note.gnu.build-id) } :htext :note + .dynsym : { *(.dynsym) } :htext + .dynstr : { *(.dynstr) } :htext + .gnu.version : { *(.gnu.version) } :htext + .gnu.version_d : { *(.gnu.version_d) } :htext + .gnu.version_r : { *(.gnu.version_r) } :htext + .rel.init : { *(.rel.init) } :htext + .rela.init : { *(.rela.init) } :htext + .rel.text : { *(.rel.text .rel.text.* .rel.gnu.linkonce.t.*) } :htext + .rela.text : { *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) } :htext + .rel.fini : { *(.rel.fini) } :htext + .rela.fini : { *(.rela.fini) } :htext + .rel.rodata : { *(.rel.rodata .rel.rodata.* .rel.gnu.linkonce.r.*) } :htext + .rela.rodata : { *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) } :htext + .rel.data.rel.ro : { *(.rel.data.rel.ro*) } :htext + .rela.data.rel.ro : { *(.rel.data.rel.ro*) } 
:htext + .rel.data : { *(.rel.data .rel.data.* .rel.gnu.linkonce.d.*) } :htext + .rela.data : { *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) } :htext + .rel.tdata : { *(.rel.tdata .rel.tdata.* .rel.gnu.linkonce.td.*) } :htext + .rela.tdata : { *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) } :htext + .rel.tbss : { *(.rel.tbss .rel.tbss.* .rel.gnu.linkonce.tb.*) } :htext + .rela.tbss : { *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) } :htext + .rel.ctors : { *(.rel.ctors) } :htext + .rela.ctors : { *(.rela.ctors) } :htext + .rel.dtors : { *(.rel.dtors) } :htext + .rela.dtors : { *(.rela.dtors) } :htext + .rel.got : { *(.rel.got) } :htext + .rela.got : { *(.rela.got) } :htext + .rel.bss : { *(.rel.bss .rel.bss.* .rel.gnu.linkonce.b.*) } :htext + .rela.bss : { *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) } :htext + .rel.plt : { *(.rel.plt) } :htext + .rela.plt : { *(.rela.plt) } :htext + .init : + { + KEEP (*(.init)) + } :htext =0x90909090 + .plt : { *(.plt) } :htext + .text : + { + *(.text .stub .text.* .gnu.linkonce.t.*) + KEEP (*(.text.*personality*)) + /* .gnu.warning sections are handled specially by elf32.em. */ + *(.gnu.warning) + } :htext =0x90909090 + .fini : + { + KEEP (*(.fini)) + } :htext =0x90909090 + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } :htext + .rodata1 : { *(.rodata1) } :htext + .eh_frame_hdr : { *(.eh_frame_hdr) } :htext :eh_frame_hdr + .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) } :htext + .gcc_except_table : ONLY_IF_RO { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :htext + /* We don't maintain address congruence here, because saving + * hugepages is more important than saving executable size. */ + . 
= ALIGN (0x1000000); /* Align to 16MB (4MB hugepage size, plus some slack in case of larger hugepages in future */ + /* Exception handling */ + .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) } :hdata + .gcc_except_table : ONLY_IF_RW { KEEP (*(.gcc_except_table)) *(.gcc_except_table.*) } :hdata + /* Thread Local Storage sections */ + .tdata : { *(.tdata .tdata.* .gnu.linkonce.td.*) } :hdata + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } :hdata + /* Ensure the __preinit_array_start label is properly aligned. We + could instead move the label definition inside the section, but + the linker would then create the section even if it turns out to + be empty, which isn't pretty. */ + . = ALIGN(64 / 8); + PROVIDE (__preinit_array_start = .); + .preinit_array : { KEEP (*(.preinit_array)) } :hdata + PROVIDE (__preinit_array_end = .); + PROVIDE (__init_array_start = .); + .init_array : { KEEP (*(.init_array)) } :hdata + PROVIDE (__init_array_end = .); + PROVIDE (__fini_array_start = .); + .fini_array : { KEEP (*(.fini_array)) } :hdata + PROVIDE (__fini_array_end = .); + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin*.o(.ctors)) + /* We don't want to include the .ctor section from + from the crtend.o file until after the sorted ctors. 
+ The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } :hdata + .dtors : + { + KEEP (*crtbegin*.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend*.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } :hdata + .jcr : { KEEP (*(.jcr)) } :hdata + .data.rel.ro : { *(.data.rel.ro.local) *(.data.rel.ro*) } :hdata + .dynamic : { *(.dynamic) } :dynamic :hdata + .got : { *(.got) } :hdata + .got.plt : { *(.got.plt) } :hdata + .data : + { + *(.data .data.* .gnu.linkonce.d.*) + KEEP (*(.gnu.linkonce.d.*personality*)) + SORT(CONSTRUCTORS) + } :hdata + .data1 : { *(.data1) } :hdata + _edata = .; + PROVIDE (edata = .); + __bss_start = .; + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* Align here to ensure that the .bss section occupies space up to + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. */ + . = ALIGN(64 / 8); + } :hdata + . = ALIGN(64 / 8); + _end = .; + PROVIDE (end = .); + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /DISCARD/ : { *(.note.GNU-stack) } +} diff --git a/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_debug.h b/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_debug.h new file mode 100644 index 0000000..cd490ad --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_debug.h @@ -0,0 +1,42 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 IBM Corporation. + * Author: Andy Whitcroft + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef _LIBHUGETLBFS_DEBUG_H +#define _LIBHUGETLBFS_DEBUG_H + +/* Severe, unrecoverable errors */ +#define ERROR(...) REPORT(1, "ERROR", ##__VA_ARGS__) +#define ERROR_CONT(...) REPORT_CONT(1, "ERROR", ##__VA_ARGS__) + +/* A condition that is recoverable, but may result in altered semantics */ +#define WARNING(...) REPORT(2, "WARNING", ##__VA_ARGS__) +#define WARNING_CONT(...) REPORT_CONT(2, "WARNING", ##__VA_ARGS__) + +/* Detailed information about normal library operations */ +#define INFO(...) REPORT(3, "INFO", ##__VA_ARGS__) +#define INFO_CONT(...) REPORT_CONT(3, "INFO", ##__VA_ARGS__) + +/* Diagnostic information used for debugging problems */ +#define DEBUG(...) REPORT(4, "DEBUG", ##__VA_ARGS__) +#define DEBUG_CONT(...) REPORT_CONT(4, "DEBUG", ##__VA_ARGS__) + +#define VERBOSITY_MAX 4 +#define VERBOSITY_DEFAULT 2 + +#endif diff --git a/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_internal.h b/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_internal.h new file mode 100644 index 0000000..ae8d7bc --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_internal.h @@ -0,0 +1,210 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This file should only contain definitions of functions, data types, and + * constants which are used internally within the libhugetlbfs library. + * + * All external functions declared here are library static and must be + * internalised using a define of the following form: + * + * #define foo __lh_foo + */ + +#ifndef _LIBHUGETLBFS_INTERNAL_H +#define _LIBHUGETLBFS_INTERNAL_H + +#include <elf.h> +#include <link.h> +#include <limits.h> +#include <stdbool.h> + +#ifndef __LIBHUGETLBFS__ +#error This header should not be included by library users. +#endif /* __LIBHUGETLBFS__ */ + +#include "libhugetlbfs_privutils.h" +#include "libhugetlbfs_testprobes.h" + +#define stringify_1(x) #x +#define stringify(x) stringify_1(x) + +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) +#define ALIGN_UP(x,a) ALIGN(x,a) +#define ALIGN_DOWN(x,a) ((x) & ~((a) - 1)) + +#if defined(__powerpc64__) || defined (__powerpc__) +#define SLICE_LOW_SHIFT 28 +#define SLICE_HIGH_SHIFT 40 +#elif defined(__ia64__) +#define SLICE_HIGH_SHIFT 63 +#endif + +struct libhugeopts_t { + int sharing; + bool min_copy; + bool shrink_ok; + bool shm_enabled; + bool no_reserve; + bool map_hugetlb; + bool thp_morecore; + unsigned long force_elfmap; + char *ld_preload; + char *elfmap; + char *share_path; + char *features; + char *path; + char *def_page_size; + char *morecore; + char *heapbase; +}; + +/* + * When adding a library local variable externalise the symbol as + * normal, plus add a #define of the form below. This define effectively + * renames the routine into the local namespace __lh_* which is forced + * local in the linker script version.lds. 
Some routines may need to be + * exported in the utilities library these are marked __pu_* which marks + * them for export in libhugetlbfs_privutils; their definitions should + * appear in libhugetlbfs_privutils.h rather than here. + */ +#define __hugetlbfs_verbose __lh___hugetlbfs_verbose +extern int __hugetlbfs_verbose; +#define __hugetlbfs_debug __lh___hugetlbfs_debug +extern bool __hugetlbfs_debug; +#define __hugetlbfs_prefault __lh___hugetlbfs_prefault +extern bool __hugetlbfs_prefault; +#define hugetlbfs_setup_env __lh_hugetlbfs_setup_env +extern void hugetlbfs_setup_env(); +#define hugetlbfs_setup_elflink __lh_hugetlbfs_setup_elflink +extern void hugetlbfs_setup_elflink(); +#define hugetlbfs_setup_morecore __lh_hugetlbfs_setup_morecore +extern void hugetlbfs_setup_morecore(); +#define hugetlbfs_setup_debug __lh_hugetlbfs_setup_debug +extern void hugetlbfs_setup_debug(); +#define setup_mounts __lh_setup_mounts +extern void setup_mounts(); +#define setup_features __lh_setup_features +extern void setup_features(); +#define hugetlbfs_check_priv_resv __lh_hugetlbfs_check_priv_resv +extern void hugetlbfs_check_priv_resv(); +#define hugetlbfs_check_safe_noreserve __lh_hugetlbfs_check_safe_noreserve +extern void hugetlbfs_check_safe_noreserve(); +#define hugetlbfs_check_map_hugetlb __lh_hugetblfs_check_map_hugetlb +extern void hugetlbfs_check_map_hugetlb(); +#define __hugetlbfs_hostname __lh___hugetlbfs_hostname +extern char __hugetlbfs_hostname[]; +#define hugetlbfs_prefault __lh_hugetlbfs_prefault +extern int hugetlbfs_prefault(void *addr, size_t length); +#define parse_page_size __lh_parse_page_size +extern long parse_page_size(const char *str); +#define probe_default_hpage_size __lh__probe_default_hpage_size +extern void probe_default_hpage_size(void); +#define debug_show_page_sizes __lh__debug_show_page_sizes +extern void debug_show_page_sizes(void); +#define hugetlbfs_setup_kernel_page_size __lh__hugetlbfs_setup_kernel_page_size +extern void 
hugetlbfs_setup_kernel_page_size(void); +#define __hugetlb_opts __lh__hugetlb_opts +extern struct libhugeopts_t __hugetlb_opts; + +#ifndef REPORT_UTIL +#define REPORT_UTIL "libhugetlbfs" +#endif + +#define VERBOSE_ERROR 1 +#define VERBOSE_WARNING 2 +#define VERBOSE_INFO 3 +#define VERBOSE_DEBUG 4 + +#ifndef REPORT +#define REPORT(level, prefix, format, ...) \ + do { \ + if (__hugetlbfs_verbose >= level) { \ + fprintf(stderr, REPORT_UTIL); \ + if (__hugetlbfs_verbose >= VERBOSE_DEBUG) \ + fprintf(stderr, " [%s:%d]", \ + __hugetlbfs_hostname, getpid()); \ + fprintf(stderr, ": " prefix ": " format, \ + ##__VA_ARGS__); \ + fflush(stderr); \ + } \ + } while (0) + +#define REPORT_CONT(level, prefix, ...) \ + do { \ + if (__hugetlbfs_verbose >= level) { \ + fprintf(stderr, ##__VA_ARGS__); \ + fflush(stderr); \ + } \ + } while (0) +#endif + +#include "libhugetlbfs_debug.h" + +#if defined(__powerpc64__) && !defined(__LP64__) +/* Older binutils fail to provide this symbol */ +#define __LP64__ +#endif + +/* Multiple huge page size support */ +struct hpage_size { + unsigned long pagesize; + char mount[PATH_MAX+1]; +}; + +struct hpage_pool { + unsigned long pagesize; + unsigned long minimum; + unsigned long maximum; + unsigned long size; + int is_default; +}; + +#define size_to_smaller_unit __lh_size_to_smaller_unit +extern unsigned long long size_to_smaller_unit(unsigned long long size); + +#define file_read_ulong __lh_file_read_ulong +extern long file_read_ulong(char *file, const char *tag); +#define file_write_ulong __lh_file_write_ulong +extern int file_write_ulong(char *file, unsigned long val); + +#define hpool_sizes __lh_hpool_sizes +extern int hpool_sizes(struct hpage_pool *, int); +#define get_pool_size __lh_get_pool_size +extern int get_pool_size(long, struct hpage_pool *); + +/* Arch-specific callbacks */ +extern int direct_syscall(int sysnum, ...); +extern ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab); + +#define MEMINFO "/proc/meminfo" +#define PROC_HUGEPAGES_DIR 
"/proc/sys/vm/" +#define SYSFS_HUGEPAGES_DIR "/sys/kernel/mm/hugepages/" + +#define hugetlbfs_test_pagesize __lh_hugetlbfs_test_pagesize +long hugetlbfs_test_pagesize(const char *mount); + +/* Diagnoses/debugging only functions */ +#define dump_proc_pid_maps __lh_dump_proc_pid_maps +long dump_proc_pid_maps(void); + +#define plt_extrasz __lh_plt_extrasz +ElfW(Word) plt_extrasz(ElfW(Dyn) *dyntab); + +#endif /* _LIBHUGETLBFS_INTERNAL_H */ diff --git a/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_privutils.h b/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_privutils.h new file mode 100644 index 0000000..149e42f --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_privutils.h @@ -0,0 +1,94 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This file should only contain definitions of functions, data types, and + * constants which are part of the internal private utilities interfaces. + * These are exposed only to utilities and tests within the source, this is + * not a public interface nor part of the libhugetlbfs API. 
+ * + * All functions declared external here must be externalised using a define + * of the following form: + * + * #define foo __pu_foo + */ + +#ifndef _LIBHUGETLBFS_PRIVUTILS_H +#define _LIBHUGETLBFS_PRIVUTILS_H + +/* Hugetlb pool counter operations */ +/* Keys for reading hugetlb pool counters */ +enum { /* The number of pages of a given size that ... */ + HUGEPAGES_TOTAL, /* are allocated to the pool */ + HUGEPAGES_TOTAL_MEMPOL, /* are allocated following the NUMA mempolicy */ + HUGEPAGES_FREE, /* are not in use */ + HUGEPAGES_RSVD, /* are reserved for possible future use */ + HUGEPAGES_SURP, /* are allocated to the pool on demand */ + HUGEPAGES_OC, /* can be allocated on demand - maximum */ + HUGEPAGES_MAX_COUNTERS, +}; +#define get_huge_page_counter __pu_get_huge_page_counter +long get_huge_page_counter(long pagesize, unsigned int counter); +#define set_huge_page_counter __pu_set_huge_page_counter +int set_huge_page_counter(long pagesize, unsigned int counter, + unsigned long val); +#define set_nr_hugepages __pu_set_nr_hugepages +int set_nr_hugepages(long pagesize, unsigned long val); +#define set_nr_overcommit_hugepages __pu_set_nr_overcommit_hugepages +int set_nr_overcommit_hugepages(long pagesize, unsigned long val); + +#define kernel_has_hugepages __pu_kernel_has_hugepages +int kernel_has_hugepages(void); + +#define kernel_has_overcommit __pu_kernel_has_overcommit +int kernel_has_overcommit(void); + +#define read_meminfo __pu_read_meminfo +long read_meminfo(const char *tag); + +#define kernel_default_hugepage_size __pu_kernel_default_hugepage_size +long kernel_default_hugepage_size(void); + +#define read_nr_overcommit __pu_read_nr_overcommit +long read_nr_overcommit(long page_size); + +#define restore_overcommit_pages __pu_restore_overcommit_pages +void restore_overcommit_pages(long page_size, long oc_pool); + +/* Kernel feature testing */ +/* This enum defines the bits in a feature bitmask */ +enum { + /* Reservations are created for private mappings */ 
+ HUGETLB_FEATURE_PRIVATE_RESV, + + /* Whether use of MAP_NORESERVE is safe or can result in OOM */ + HUGETLB_FEATURE_SAFE_NORESERVE, + + /* If the kernel has the ability to mmap(MAP_HUGETLB)*/ + HUGETLB_FEATURE_MAP_HUGETLB, + + HUGETLB_FEATURE_NR, +}; +#define hugetlbfs_test_feature __pu_hugetlbfs_test_feature +int hugetlbfs_test_feature(int feature_code); + +#define test_compare_kver __pu_test_compare_kver +int test_compare_kver(const char *a, const char *b); + +#endif /* _LIBHUGETLBFS_PRIVUTILS_H */ diff --git a/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_testprobes.h b/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_testprobes.h new file mode 100644 index 0000000..6e01da4 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/libhugetlbfs_testprobes.h @@ -0,0 +1,39 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 IBM Corporation, author: Andy Whitcroft + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * This file should only contain definitions of functions, data types, and + * constants which are part of the internal library test probe interfaces. + * These are exposed only to utilities and tests within the source, this is + * not a public interface nor part of the libhugetlfs API. 
+ * + * All functions declared external here must be externalised using a define + * of the following form: + * + * #define foo __tp_foo + */ + +#ifndef _LIBHUGETLBFS_TESTPROBES_H +#define _LIBHUGETLBFS_TESTPROBES_H + +#define kernel_default_hugepage_size_reset \ + __tp_kernel_default_hugepage_size_reset +void kernel_default_hugepage_size_reset(void); + +#endif /* _LIBHUGETLBFS_TESTPROBES_H */ diff --git a/default/libhugetlbfs/libhugetlbfs/localversion b/default/libhugetlbfs/libhugetlbfs/localversion new file mode 100755 index 0000000..5d50aca --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/localversion @@ -0,0 +1,90 @@ +#!/bin/sh +# +# libhugetlbfs - Easy use of Linux hugepages +# Copyright (C) 2006 Andy Whitcroft, IBM Corporation +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public License +# as published by the Free Software Foundation; either version 2.1 of +# the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +if [ "$#" -lt 1 ]; then + echo "Usage: localversion <version file> <source> ..." 1>&2 + exit 1 +fi +file="$1" + +if [ -f "$file" ]; then + read current_version <"$file" +fi +version="$current_version" + +modified=0 + +# GIT: check for a git tree. +mod=`git diff-index HEAD 2>/dev/null` +if [ "$?" -eq 0 ]; then + # This is a GIT repo, see if it was modified. 
+ if [ "$mod" != "" ]; then + modified=1 + else + # Subtle, if we are in a git archive and the repository + # is clean then update the time on the version file + # thus ensuring it will be correct in any tarball. + touch "$file" + fi + + # Try and get a real "tag relative" version name for it. + version=`git describe --tags HEAD 2>&1` + if [ "$?" -ne 0 ]; then + # ok, desperation just take the commit id. + version=`git log | awk '{ print $2; exit }'` + version="commit<$version>" + fi + +else + if [ ! -f "$file" ]; then + echo 1>&2 "$0: ERROR: unversioned tarball" + echo "#error UNVERSIONED tarball" >"$file.h" + exit 1 + fi + + # No version control, use the modification times + # of the source. + for s in "$@" + do + if [ "$s" -nt "$file" ]; then + modified=1 + fi + done +fi + +if [ "$current_version" != "$version" ]; then + echo "version update: $version" + echo "$version" >"$file" +fi + +# Update the c-define for this version, take the modification +# flags into account. +version_modified="$version" +[ "$modified" -eq 1 ] && version_modified="$version_modified (modified)" + +if [ -f "$file.h" ]; then + read d1 current_version_modified <"$file.h" +fi +if [ "$current_version_modified" != "$version_modified" ]; then + echo "version string: $version_modified" + echo "// $version_modified" >"$file.h" + echo "#define VERSION \"$version_modified\"" >>"$file.h" +fi + +exit 0 diff --git a/default/libhugetlbfs/libhugetlbfs/man/cpupcstat.8 b/default/libhugetlbfs/libhugetlbfs/man/cpupcstat.8 new file mode 100644 index 0000000..d84a726 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/cpupcstat.8 @@ -0,0 +1,117 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH CPUCPSTAT 8 "9 June, 2009" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +cpupcstat \- Measure the DTLB miss rate +.SH SYNOPSIS +.B cpupcstat [options] [target] +.SH DESCRIPTION +\fBcpupcstat\fP uses oprofile to measure the DTLB miss rate of a +specified application or the kernel. It configures oprofile to count the +number of DTLB misses, optionally starts the \fBtarget\fP, and reports on the +miss rate over a specified interval as \fBtarget\fP executes. + +The following options can be used to configure how \fBcpupcstat\fP works: + +.TP +.B --vmlinux </path/to/vmlinux> + +This allows the user to specify where the appropriate vmlinux file is for their +kernel. If this is not specified, /boot/vmlinux\-\`uname \-r\` will be used. + +.TP +.B --delay <seconds> + +This allows the user to specify the reporting interval. The default is 10 +seconds. + +.TP +.B --target-global + +Gather statistics for all processes and the kernel running in the system. + +.TP +.B --target-pid <pid> + +This allows the user to specify the pid of a process that is already +running. If this option is specified, \fBtarget\fP will be ignored. + +.TP +.B --real-target <real-target> + +Use this to specify the real name of the program to monitor if the \fBtarget\fP +is a launcher script. When this is specified, \fBtarget\fP is executed but the +report will be for \fBreal-target\fP. + +.TP +.B --time-limit <sec> + +This option sets the time limit for monitoring. If this is specified the +\fBtarget\fP or \fBpid\fP will only be monitored for \fBsec\fP seconds. The +default continues monitoring while \fBtarget\fP or \fBpid\fP are still alive. 
+ +.TP +.B --kernel + +This allows the user to request DTLB miss rate data be collected for the kernel +as well as the \fBtarget\fP. + +.TP +.B --misses-per-instruction + +This option requests that the ratio of instructions retired per TLB miss be +displayed. + +.TP +.B --misses-per-cycle + +This option requests that the ratio of CPU cycles per TLB miss be displayed. + +.TP +.B --time-servicing + +This option requests that the percentage of CPU cycles spent servicing TLB +misses is displayed when \fBcpupcstat\fP exits. To use this option the cost +in CPU cycles for a single TLB miss must be specified using either the +\fB--cost-config\fP option or the \fBtlbmiss_cost.sh\fP script. + +.TP +.B --cost-config </path/to/config> + +This option tells \fBcpupcstat\fP that the cost in CPU cycles of a TLB miss +can be found in the specified file, it should be specified as: + +TLB_MISS_COST=XX + +Where XX is the cost in cycles. This option is only used with the +\fB--time-servicing\fP option. + +.TP +.B --force-oprofile + +\fBcpupcstat\fP prefers the perf tool for data collection, only using oprofile +if perf is not present or supported. This option will force \fBcpupcstat\fP to +use oprofile for data collection. + +.SH SEE ALSO +.I oprofile(1) +.I perf(1) +.I tlbmiss_cost.sh(8) +.br +.SH AUTHORS +Eric B Munson <ebmunson@xxxxxxxxxx> is the primary author. See the documentation +for other contributors. + diff --git a/default/libhugetlbfs/libhugetlbfs/man/get_huge_pages.3 b/default/libhugetlbfs/libhugetlbfs/man/get_huge_pages.3 new file mode 100644 index 0000000..86d03c9 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/get_huge_pages.3 @@ -0,0 +1,73 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GET_HUGE_PAGES 3 "October 8, 2008" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +get_huge_pages, free_huge_pages \- Allocate and free hugepages +.SH SYNOPSIS +.B #include <hugetlbfs.h> +.br + +.br +.B void *get_huge_pages(size_t len, ghp_t flags); +.br +.B void free_huge_pages(void *ptr); +.SH DESCRIPTION + +\fBget_huge_pages()\fP allocates a memory region \fBlen\fP bytes in size +backed by hugepages. Hugepages may be of benefit to applications that use +large amounts of address space and suffer a performance hit due to TLB +misses. Wall-clock time or oprofile can be used to determine if there is +a performance benefit from using hugepages or not. + +The \fBlen\fP parameter must be hugepage-aligned. In the current +implementation, only the default hugepage size may be allocated via this +function. Use \fBgethugepagesize\fP to discover what the alignment should +be. + +The \fBflags\fP argument changes the behaviour +of the function. Flags may be or'd together. + +.TP +.B GHP_DEFAULT + +Allocate a region of memory of the requested length backed by hugepages of +the default hugepage size. Return NULL if sufficient pages are not available + +.PP + +\fBfree_huge_pages()\fP frees a region of memory allocated by +\fBget_huge_pages()\fP. The behaviour of the function if another pointer +is used, valid or otherwise, is undefined. + +.SH RETURN VALUE + +On success, a pointer is returned to the allocated memory. On +error, NULL is returned. errno will be set based on what the failure of +mmap() was due to. + +.SH SEE ALSO +.I oprofile(1) +, +.I gethugepagesize(3) +, +.I get_hugepage_region(3) +, +.I libhugetlbfs(7) +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. 
+ diff --git a/default/libhugetlbfs/libhugetlbfs/man/get_hugepage_region.3 b/default/libhugetlbfs/libhugetlbfs/man/get_hugepage_region.3 new file mode 100644 index 0000000..63fd40c --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/get_hugepage_region.3 @@ -0,0 +1,88 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GET_HUGEPAGE_REGION 3 "November 7, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +get_hugepage_region, free_hugepage_region \- Allocate and free regions of memory that use hugepages where possible +.SH SYNOPSIS +.B #include <hugetlbfs.h> +.br + +.br +.B void *get_hugepage_region(size_t len, ghr_t flags); +.br +.B void free_hugepage_region(void *ptr); +.SH DESCRIPTION + +\fBget_hugepage_region()\fP allocates a memory region \fBlen\fP bytes in size +backed by hugepages. Hugepages may be of benefit to applications that use +large amounts of address space and suffer a performance hit due to TLB +misses. Wall-clock time or oprofile can be used to determine if there is +a performance benefit from using hugepages or not. + +Unlike \fBget_huge_pages()\fP, \fBlen\fP does not have to be hugepage-aligned +although memory may be wasted due to alignment. The caller may also specify +that base pages be used in the event there are no hugepages available. + +The \fBflags\fP argument changes the behaviour of the function. Flags may +be or'd together. + +.TP +.B GHR_FALLBACK +Use base pages if there are an insufficient number of huge pages. 
+ +.TP +.B GHR_STRICT +Use hugepages or return NULL. + +.TP +.B GHR_COLOR +When specified, bytes that would be wasted due to alignment are used to +color the buffer by offsetting it by a random cacheline within the hugepage. +This avoids a performance problem whereby multiple buffers use the same +cache lines at the same offsets. If it is not important that the start of the +buffer be page-aligned, specify this flag. + +.TP +.B GHR_DEFAULT +The library chooses a sensible combination of flags for allocating a region of +memory. The current default is: + GHR_FALLBACK | GHR_COLOR + +.PP + +\fBfree_hugepage_region()\fP frees a region of memory allocated by +\fBget_hugepage_region()\fP. The behaviour of the function if another +pointer is used, valid or otherwise, is undefined. + +.SH RETURN VALUE + +On success, a pointer is returned to the allocated memory. On +error, NULL is returned. errno will be set based on what the failure of +mmap() was due to. + +.SH SEE ALSO +.I oprofile(1) +, +.I gethugepagesize(3) +, +.I get_huge_pages(3) +, +.I libhugetlbfs(7) +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/default/libhugetlbfs/libhugetlbfs/man/gethugepagesizes.3 b/default/libhugetlbfs/libhugetlbfs/man/gethugepagesizes.3 new file mode 100644 index 0000000..02442a9 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/gethugepagesizes.3 @@ -0,0 +1,69 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GETHUGEPAGESIZES 3 "October 10, 2008" +.\" Please adjust this date whenever revising the manpage. 
+.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +gethugepagesizes - Get the system supported huge page sizes +.SH SYNOPSIS +.B #include <hugetlbfs.h> +.br + +.br +int gethugepagesizes(long pagesizes[], int n_elem); + +.SH DESCRIPTION + +The gethugepagesizes() function returns either the number of system supported +huge page sizes or the sizes themselves. If \fBpagesizes\fP is NULL and +\fBn_elem\fP is 0, then the number of huge pages the system supports is +returned. Otherwise, \fBpagesizes\fP is filled with at most \fBn_elem\fP +page sizes. + +.SH RETURN VALUE + +On success, either the number of huge page sizes supported by the system or +the number of huge page sizes stored in pagesizes is returned. On failure, +-1 is returned and errno is set appropriately. + +.SH ERRORS + +.TP +.B ERRNO +\fBn_elem\fP is less than zero or \fBn_elem\fP is greater than zero and +\fBpagesizes\fP is NULL. + +.PP + +Also see opendir(3) for other possible values for errno. This error occurs +when the sysfs directory exists but cannot be opened. + +.SH NOTES + +This call will return all huge page sizes as reported by the kernel. +Not all of these sizes may be usable by the programmer since mount points +may not be available for all sizes. To test whether a size will be usable +by \fBlibhugetlbfs\fP, hugetlbfs_find_path_for_size() can be called on a +specific size to see if a mount point is configured. + +.SH SEE ALSO +.I oprofile(1), +.I opendir(3), +.I libhugetlbfs(7) + +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. 
+ diff --git a/default/libhugetlbfs/libhugetlbfs/man/getpagesizes.3 b/default/libhugetlbfs/libhugetlbfs/man/getpagesizes.3 new file mode 100644 index 0000000..fb3a387 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/getpagesizes.3 @@ -0,0 +1,70 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH GETPAGESIZES 3 "October 10, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +getpagesizes - Get the system supported huge page sizes +.SH SYNOPSIS +.B #include <hugetlbfs.h> +.br + +.br +int getpagesizes(long pagesizes[], int n_elem); + +.SH DESCRIPTION + +The getpagesizes() function returns either the number of system supported +page sizes or the sizes themselves. If \fBpagesizes\fP is NULL and +\fBn_elem\fP is 0, then the number of pages the system supports is +returned. Otherwise, \fBpagesizes\fP is filled with at most \fBn_elem\fP +page sizes. + +.SH RETURN VALUE + +On success, either the number of page sizes supported by the system or the +number of page sizes stored in \fBpagesizes\fP is returned. On failure, +-1 is returned and errno is set appropriately. + +.SH ERRORS + +.TP +.B ERRNO +\fBn_elem\fP is less than zero or \fBn_elem\fP is greater than zero and +\fBpagesizes\fP is NULL. + +.PP + +Also see opendir(3) for other possible values for errno. This error occurs +when the sysfs directory exists but cannot be opened. + +.SH NOTES + +This call will return all page sizes as reported by the kernel. 
Not all of +these sizes may be usable by the programmer since mount points may not be +available for the huge page sizes. To test whether a size will be usable +by \fBlibhugetlbfs\fP, hugetlbfs_find_path_for_size() can be called on a +specific size to see if a mount point is configured. + +.SH SEE ALSO +.I oprofile(1), +.I opendir(3), +.I gethugepagesizes(3), +.I libhugetlbfs(7) + +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/default/libhugetlbfs/libhugetlbfs/man/hugeadm.8 b/default/libhugetlbfs/libhugetlbfs/man/hugeadm.8 new file mode 100644 index 0000000..28de91e --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/hugeadm.8 @@ -0,0 +1,294 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGEADM 8 "October 1, 2009" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugeadm \- Configure the system huge page pools +.SH SYNOPSIS +.B hugeadm [options] +.SH DESCRIPTION + +\fBhugeadm\fP displays and configures the systems huge page pools. The size +of the pools is set as a minimum and maximum threshold. The minimum value +is allocated up front by the kernel and guaranteed to remain as hugepages +until the pool is shrunk. If a maximum is set, the system will dynamically +allocate pages if applications request more hugepages than the minimum size +of the pool. There is no guarantee that more pages than this minimum pool +size can be allocated. 
+ +The following options create hugetlbfs mount points. + +.TP +.B --create-mounts + +This creates mount points for each supported huge page size under +/var/lib/hugetlbfs. After creation they are mounted and are owned by +root:root with permissions set to 770. Each mount point is named +pagesize-<size in bytes>. + +.TP +.B --create-user-mounts=<user> + +This creates mount points for each supported huge page size under +/var/lib/hugetlbfs/user/<user>. Mount point naming is the same as +--create-mounts. After creation they are mounted and are owned by +<user>:root with permissions set to 700. + +.TP +.B --create-group-mounts=<group> + +This creates mount points for each supported huge page size under +/var/lib/hugetlbfs/group/<group>. Mount point naming is the same as +--create-mounts. After creation they are mounted and are owned by +root:<group> with permissions set to 070. + +.TP +.B --create-global-mounts + +This creates mount points for each supported huge page size under +/var/lib/hugetlbfs/global. Mount point naming is the same as +--create-mounts. After creation they are mounted and are owned by +root:root with permissions set to 1777. + +The following options affect how mount points are created. + +.TP +.B --max-size + +This option is used in conjunction with --create-*-mounts. It limits the +maximum amount of memory used by files within the mount point rounded up +to the nearest huge page size. This can be used for example to grant +different huge page quotas to individual users or groups. + +.TP +.B --max-inodes + +This option is used in conjunction with --create-*-mounts. It limits the +number of inodes (e.g. files) that can be created on the new mount points. +This limits the number of mappings that can be created on a mount point. It +could be used for example to limit the number of application instances that +used a mount point as long as it was known how many inodes each application +instance required. 
+ +The following options display information about the pools. + +.TP +.B --pool-list + +This displays the Minimum, Current and Maximum number of huge pages in the pool +for each pagesize supported by the system. The "Minimum" value is the size of +the static pool and there will always be at least this number of hugepages in +use by the system, either by applications or kept by the kernel in a reserved +pool. The "Current" value is the number of hugepages currently in use, either +by applications or stored on the kernels free list. The "Maximum" value is the +largest number of hugepages that can be in use at any given time. + +.TP +.B --set-recommended-min_free_kbytes + +Fragmentation avoidance in the kernel depends on avoiding pages of different +mobility types being mixed with a pageblock arena - typically the size of +the default huge page size. The more mixing that occurs, the less likely +the huge page pool will be able to dynamically resize. The easiest means of +avoiding mixing is to increase /proc/sys/vm/min_free_kbytes. This parameter +sets min_free_kbytes to a recommended value to aid fragmentation avoidance. + +.TP +.B --set-recommended-shmmax + +The maximum shared memory segment size should be set to at least the size +of the largest shared memory segment size you want available for applications +using huge pages, via /proc/sys/kernel/shmmax. Optionally, it can be set +automatically to match the maximum possible size of all huge page allocations +and thus the maximum possible shared memory segment size, using this switch. + +.TP +.B --set-shm-group=<gid|groupname> + +Users in the group specified in /proc/sys/vm/hugetlb_shm_group are granted +full access to huge pages. The sysctl takes a numeric gid, but this hugeadm +option can set it for you, using either a gid or group name. + +.TP +.B --page-sizes + +This displays every page size supported by the system and has a pool +configured. 
+ +.TP +.B --page-sizes-all + +This displays all page sizes supported by the system, even if no pool is +available. + +.TP +.B --list-all-mounts + +This displays all active mount points for hugetlbfs. + +.PP +The following options configure the pool. + +.TP +.B --pool-pages-min=<size|DEFAULT>:[+|-]<pagecount|memsize<G|M|K>> + +This option sets or adjusts the Minimum number of hugepages in the pool for +pagesize \fBsize\fP. \fBsize\fP may be specified in bytes or in kilobytes, +megabytes, or gigabytes by appending K, M, or G respectively, or as DEFAULT, +which uses the system's default huge page size for \fBsize\fP. The pool size +adjustment can be specified by \fBpagecount\fP pages or by \fBmemsize\fP, if +postfixed with G, M, or K, for gigabytes, megabytes, or kilobytes, +respectively. If the adjustment is specified via \fBmemsize\fP, then the +\fBpagecount\fP will be calculated for you, based on page size \fBsize\fP. +The pool is set to \fBpagecount\fP pages if + or - are not specified. If ++ or - are specified, then the size of the pool will adjust by that amount. +Note that there is no guarantee that the system can allocate the hugepages +requested for the Minimum pool. The size of the pools should be checked after +executing this command to ensure they were successful. + +.TP +.B --obey-numa-mempol + +This option requests that allocation of huge pages to the static pool with +\fB--pool-pages-min\fP obey the NUMA memory policy of the current process. This +policy can be explicitly specified using numactl or inherited from a parent +process. + +.TP +.B --pool-pages-max=<size|DEFAULT>:[+|-]<pagecount|memsize<G|M|K>> + +This option sets or adjusts the Maximum number of hugepages. Note that while +the Minimum number of pages are guaranteed to be available to applications, +there is no guarantee that the system can allocate the pages on demand when +the number of huge pages requested by applications is between the Minimum and +Maximum pool sizes. 
See --pool-pages-min for usage syntax. + +.TP +.B --enable-zone-movable + +This option enables the use of the MOVABLE zone for the allocation of +hugepages. This zone is created when kernelcore= or movablecore= are specified +on the kernel command line but the zone is not used for the allocation of +huge pages by default as the intended use for the zone may be to guarantee +that memory can be off-lined and hot-removed. The kernel guarantees that +the pages within this zone can be reclaimed unlike some kernel buffers +for example. Unless pages are locked with mlock(), the hugepage pool can +grow to at least the size of the movable zone once this option is set. Use +sysctl to permanently enable the use of the MOVABLE zone for the allocation +of huge pages. + +.TP +.B --disable-zone-movable + +This option disables the use of the MOVABLE zone for the future allocation of +huge pages. Note that existing huge pages are not reclaimed from the zone. +Use sysctl to permanently disable the use of the MOVABLE zone for the +allocation of huge pages. + +.TP +.B --hard + + +This option is specified with --pool-pages-min to retry allocations multiple +times on failure to allocate the desired count of pages. It initially tries +to resize the pool up to 5 times and continues to try if progress is being +made towards the resize. + +.TP +.B --add-temp-swap<=count> + +This option is specified with --pool-pages-min to initialize a temporary +swap file for the duration of the pool resize. When increasing the size of +the pool, it can be necessary to reclaim pages so that contiguous memory is +freed and this often requires swap to be successful. Swap is only created for +a positive resize, and is then removed once the resize operation is completed. +The default swap size is 5 huge pages; the optional argument <count> sets +the swap size to <count> huge pages. + +.TP +.B --add-ramdisk-swap + +This option is specified with --pool-pages-min to initialize swap in memory +on ram disks. 
When increasing the size of the pool, it can be necessary to +reclaim pages so that contiguous memory is freed and this often requires swap +to be successful. If there isn't enough free disk space, swap can be +initialized in RAM using this option. If the size of one ramdisk is not +greater than the huge page size, then swap is initialized on multiple ramdisks. +Swap is only created for a positive resize, and by default is removed once +the resize operation is completed. + +.TP +.B --persist + +This option is specified with the --add-temp-swap or --add-ramdisk-swap to +make the swap space persist after the resize operation is completed. The swap +spaces can later be removed manually using the swapoff command. + +.PP +The following options tune the transparent huge page usage + +.TP +.B --thp-always + +Enable transparent huge pages always + +.TP +.B --thp-madvise + +Enable transparent huge pages only on madvised regions + +.TP +.B --thp-never + +Disable transparent huge pages + +.TP +.B --thp-khugepaged-pages <pages to scan> + +Configure the number of pages that khugepaged should scan on each pass + +.TP +.B --thp-khugepaged-scan-sleep <milliseconds> + +Configure how many milliseconds khugepaged should wait between passes + +.TP +.B --thp-khugepages-alloc-sleep <milliseconds> + +Configure how many milliseconds khugepaged should wait after failing to +allocate a huge page to throttle the next attempt. + +.PP +The following options affect the verbosity of libhugetlbfs. + +.TP +.B --verbose <level>, -v + +The default value for the verbosity level is 1 and the range of the value can +be set with --verbose from 0 to 99. The higher the value, the more verbose the +library will be. 0 is quiet and 3 will output much debugging information. The +verbosity level is increased by one each time -v is specified. 
+ +.SH SEE ALSO +.I oprofile(1), +.I pagesize(1), +.I libhugetlbfs(7), +.I hugectl(8), +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/default/libhugetlbfs/libhugetlbfs/man/hugectl.8 b/default/libhugetlbfs/libhugetlbfs/man/hugectl.8 new file mode 100644 index 0000000..6ee70f2 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/hugectl.8 @@ -0,0 +1,141 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGECTL 8 "October 10, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugectl \- Control policy for backing text, data and malloc() with hugepages +.SH SYNOPSIS +.B hugectl [options] command {arguments} +.SH DESCRIPTION + +\fBhugectl\fP runs processes with a specific policy for backing memory regions +with hugepages. The use of hugepages benefit applications that use large +amounts of address space and suffer a performance hit due to TLB misses. +Policy is enforced by \fBlibhugetlbfs\fP and \fBhugectl\fP configures the +environment based on the options provided. Wall-clock time or oprofile can +be used to determine if there is a performance benefit from using hugepages +or not. + +To effectively back text/data, the target process must be relinked to align +the ELF segments on a hugepage boundary. The library also supports more options +for the control of memory regions than are exposed by the \fBhugectl\fP +utility. See the \fBlibhugetlbfs\fP manual page for more details. 
+ +The following options affect what memory regions are backed by hugepages. + +.TP +.B --text[=<size>],--data[=<size>],--bss[=<size>] +Back the text, data or BSS segments with hugepages, optionally with pages +of the specified size. To be effective, the process must be relinked +as described in the HOWTO to align the ELF segments. It is possible to +partially back segments using the HUGETLB_FORCE_ELMAP environment variable +as described in the \fBlibhugetlbfs\fP manual page. + +.TP +.B --heap[=<size>] +Use the glibc morecore hook to back malloc() with hugepages, optionally +with pages of the specified size. Note that this does not affect brk() +segments and applications that use custom allocators potentially do not +use hugepages for their heap even with this option specified. + +.TP +.B --shm +This option overrides shmget() to back shared memory regions with hugepages +if possible. Segment size requests will be aligned to fit to the default +hugepage size region. + +.TP +.B --share-text +Request that multiple application instances share text segments that are +backed with huge pages. This option sets the environment variable +HUGETLB_SHARE to 1. + +.TP +.B --thp +Align heap regions to huge page size for promotion by khugepaged. For more +information on transparent huge pages see linux-2.6/Documentation/transhuge.txt + +.PP +The following options affect how \fBhugectl\fP behaves. + +.TP +.B --no-preload +Disable any pre-loading of the \fBlibhugetlbfs\fP library. This may be necessary +if only the heap is being backed by hugepages and the application is already +linked against the library. \fBhugectl\fP may pre-load the library by mistake +and this option prevents that. + +.TP +.B --force-preload +Force pre-loading of the \fBlibhugetlbfs\fP library. This option is used when +the segments of the binary are aligned to the hugepage boundary of interest +but the binary is not linked against libhugetlbfs. 
This is useful on PPC64 +where binaries are aligned to 64K as required by the ABI and the kernel is +using a 4K base pagesize. + +.TP +.B --no-reserve +By default, huge pages are reserved at mmap() time so future faults will +succeed. This avoids unexpected application but some applications depend +on memory overcommit to create large sparse mappings. For this type of +application, this switch will create huge page backed mappings without a +reservation if the kernel is recent enough to make this operation safe. +Use this option with extreme care as in the event huge pages are not +available when the mapping is faulted, the application will be killed. + +.TP +.B --dry-run +Instead of running the process, the \fBhugectl\fP utility will describe what +environment variables it set for \fBlibhugetlbfs\fP. This is useful if +additional environment variables are to be set and a launcher shell script is +being developed. + +.TP +.B --library-use-path +By default, \fBhugectl\fP will use the version of \fBlibhugetlbfs\fP it was +installed with, even if this is not in the LD_LIBRARY_PATH environment. Using +this option forces \fBhugectl\fP to use the version of \fBlibhugetlbfs\fP +installed in the library system path. + +.TP +.B --library-path <path> +This option forces \fBhugectl\fP to use the \fBlibhugetlbfs\fP libraries within +the given prefix. + +.PP +The following options affect the verbosity of libhugetlbfs. + +.TP +.B --verbose <level>, -v +The default value for the verbosity level is 1 and the range of the value can +be set with --verbose from 0 to 99. The higher the value, the more verbose the +library will be. 0 is quiet and 3 will output much debugging information. The +verbosity level is increased by one each time -v is specified. + +.TP +.B -q +The -q option will drecease the verbosity level by 1 each time it is specified +to a minimum of 0. 
+ +.SH SEE ALSO +.I oprofile(1), +.I hugeadm(7), +.I libhugetlbfs(7) +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/default/libhugetlbfs/libhugetlbfs/man/hugeedit.8 b/default/libhugetlbfs/libhugetlbfs/man/hugeedit.8 new file mode 100644 index 0000000..4bcfdfc --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/hugeedit.8 @@ -0,0 +1,57 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH HUGEEDIT 7 "October 8, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +hugeedit \- Set default policy for backing text and data with hugepages +.SH SYNOPSIS +.B hugeedit binary +.br +.B hugeedit [options] binary +.SH DESCRIPTION + +\fBhugectl\fP runs processes with a specific policy for backing memory +regions with hugepages. Ordinarily when processes are relinked with +\fBlibhugetlbfs\fP using the recommended linking method, either \fBhugectl\fP +is required on each execution or environment variables must be set for +each execution. \fBhugeedit\fP can be used to set bits in the ELF header +that determine if the text or data segments are backed by default without +further intervention. + +If no arguments are specified, \fBhugeedit\fP will display what the current +defaults for each segment in a binary are. The available options are + +.TP +.B --text +Back the text segments of the binary by default. 
+ +.TP +.B --data +Back the data segments of the binary by default + +.TP +.B --disable +Back all segments using small pages by default + +.SH SEE ALSO +.I oprofile(1), +.I libhugetlbfs(7), +.I hugectl(8) +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/default/libhugetlbfs/libhugetlbfs/man/libhugetlbfs.7 b/default/libhugetlbfs/libhugetlbfs/man/libhugetlbfs.7 new file mode 100644 index 0000000..14bcf04 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/libhugetlbfs.7 @@ -0,0 +1,212 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH LIBHUGETLBFS 7 "September 27, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +libhugetlbfs \- preload library to back text, data, malloc() or shared memory with hugepages +.SH SYNOPSIS +.B export [environment options] +.br +.B [LD_PRELOAD=libhugetlbfs.so] target_application +.SH DESCRIPTION + +\fBlibhugetlbfs\fP is a library that can back application text, data, malloc() +and shared memory with hugepages. This is of benefit to applications that +use large amounts of address space and suffer a performance hit due to TLB +misses. Wall-clock time or oprofile can be used to determine if there is +a performance benefit from using \fBlibhugetlbfs\fP or not. In all cases +but shared memory, a hugetlbfs mount must exist and a hugepage pool defined +for hugepages to be used. + +Some limited functionality is available for unmodified dynamically linked +applications. 
By preloading the library, the library can back malloc() +and shared memory, and text and data segments can be partially backed if +they are large enough. + +For the effective backing of text and data with huge pages, the application +must be linked to the library and the ELF segments correctly aligned using +the ld helpers. Once linked, malloc or shared memory can still be backed +but no pre-loading is required. See /usr/share/docs/libhugetlbfs/HOWTO for +detailed instructions on relinking applications. + +For applications that are hugepage-aware and linked to the library +\fBget_huge_pages()\fP can be used for the direct allocation of +hugepage-backed regions. + +Unless otherwise specified, \fBlibhugetlbfs\fP will use the default hugepage +size to back memory regions. The default size is the value of Hugepagesize +displayed in /proc/meminfo. The size can be specified in bytes or in +kilobytes, megabytes, or gigabytes by appending K, M, or G respectively. It +is an error to specify a invalid, unsupported, or otherwise unconfigured +huge page size. Kernel 2.6.27 or later is required to specify any pagesize +other than the default. + +See /usr/share/docs/libhugetlbfs/HOWTO for detailed instructions on how +the library should be used, particularly when relinking the application. +This manual page provides a brief synopsis of the environment variables +as a quick reference. + +The following variables affect what memory regions are backed by hugepages. In +all cases, the environment being unset implies the feature should remain +disabled. + +.TP +.B HUGETLB_MORECORE=[yes|<pagesize>] +This enables the hugepage malloc() feature, instructing libhugetlbfs to +override glibc's normal morecore() function with a hugepage version and use +it for malloc(). All application malloc() memory should come from hugepage +memory until it runs out, it will then fallback to base pages. 
Note that +applications that use custom allocators may not be able to back their heaps +using hugepages and this environment variable. It may be necessary to modify +the custom allocator to use \fBget_huge_pages()\fP. + +.TP +.B HUGETLB_SHM=yes +When this environment variable is set, the SHM_HUGETLB flag is added to +the shmget() call and the size parameter is aligned to back the shared +memory segment with hugepages. In the event hugepages cannot be used, base +pages will be used instead and a warning will be printed to explain the +failure. The pagesize cannot be specified with this parameter. To change +the kernels default hugepage size, use the pagesize= kernel boot parameter +(2.6.26 or later required). + +.TP +.B HUGETLB_ELFMAP=[no|[R[<=pagesize>]:[W[<=pagesize>]]] +If the application has been relinked (see the HOWTO for instructions), +this environment variable determines whether read-only, read-write, both +or no segments are backed by hugepages and what pagesize should be used. If +the recommended relinking method has been used, then \fBhugeedit\fP can be +used to automatically back the text or data by default. + +.TP +.B HUGETLB_FORCE_ELFMAP=yes +Force the use of hugepages for text and data segments even if the application +has not been relinked to align the ELF segments on a hugepage boundary. +Partial segment remapping is not guaranteed to work and the segments must be +large enough to contain at least one hugepage for the remapping to occur. + +.PP +The following options affect how libhugetlbfs behaves. + +.TP +.B HUGETLB_RESTRICT_EXE=e1:e2:...:eN +By default, libhugetlbfs will act on any program that it +is loaded with, either via LD_PRELOAD or by explicitly +linking with -lhugetlbfs. + +There are situations in which it is desirable to restrict +libhugetlbfs' actions to specific programs. For example, +some ISV applications are wrapped in a series of scripts +that invoke bash, python, and/or perl. 
It is more +convenient to set the environment variables related +to libhugetlbfs before invoking the wrapper scripts, +yet this has the unintended and undesirable consequence +of causing the script interpreters to use and consume +hugepages. There is no obvious benefit to causing the +script interpreters to use hugepages, and there is a +clear disadvantage: fewer hugepages are available to +the actual application. + +To address this scenario, set HUGETLB_RESTRICT_EXE to a +colon-separated list of programs to which the other +libhugetlbfs environment variables should apply. (If +not set, libhugetlbfs will attempt to apply the requested +actions to all programs.) For example, + + HUGETLB_RESTRICT_EXE=hpcc:long_hpcc + +will restrict libhugetlbfs' actions to programs named +/home/fred/hpcc and /bench/long_hpcc but not /bin/hpcc_no. + + +.TP +.B HUGETLB_MORECORE_SHRINK=yes +By default, the hugepage heap does not shrink. Shrinking is enabled by +setting this environment variable. It is disabled by default as glibc +occasionally exhibits strange behaviour if it mistakes the heap returned +by \fBlibhugetlbfs\fP as a foreign brk(). + +.TP +.B HUGETLB_NO_PREFAULT +By default \fBlibhugetlbfs\fP will prefault regions it creates to ensure they +can be referenced without receiving a SIGKILL. On kernels older than 2.6.27, +this was necessary as the system did not guarantee that future faults would +succeed on regions mapped MAP_PRIVATE. Prefaulting impacts the performance +of malloc() and can result in poor placement on NUMA systems. If it is known +the hugepage pool is large enough to run the application or the kernel is +2.6.27 or later, this environment variable should be set. + +.TP +.B HUGETLB_NO_RESERVE=yes + +By default, the kernel will reserve huge pages at mmap() time to ensure that +future faults will succeed. This avoids unexpected application failure at +fault time but some applications depend on memory overcommit to create +large sparse mappings. 
For this type of application, setting this environment +variable will create huge page backed mappings without a reservation. Use +this option with extreme care as in the event huge pages are not available +when the mapping is used, the application will be killed. On older kernels, +the use of this feature can trigger the OOM killer. Hence, even with this +variable set, reservations may still be used for safety. + +.TP +.B HUGETLB_MORECORE_HEAPBASE=address +\fBlibhugetlbfs\fP normally picks an address to use as the base of the heap for +malloc() automatically. This environment variable fixes which address is used. + +.TP +.B HUGETLB_PATH=<path> +The path to the hugetlbfs mount is automatically determined at run-time. In the +event there are multiple mounts and the wrong one is being selected, use this +option to select the correct one. This may be the case if an +application-specific mount with a fixed quota has been created for example. + +.TP +.B HUGETLB_SHARE=1 +By default, \fBlibhugetlbfs\fP uses unlinked hugetlbfs files to store remapped +program segment data. If the same program is started multiple times using +hugepage segments, multiple hugepages will be used to store the same program +data. The reduce this wastage, setting this environment variable will share +read-only segments between multiple invocations of a program at the cost of +the memory being used whether the applications are running or not. It is +also possible that a malicious application inferfere with other applications +executable code. See the HOWTO for more detailed information on this topic. + +.PP +The following options control the verbosity of \fBlibhugetlbfs\fP. + +.TP +.B HUGETLB_VERBOSE=<level> +The default value for this is 1 and the range of the value is from 0 to +99. The higher the value, the more verbose the output is. 0 is quiet and +3 will output much debugging information. 
+ +.TP +.B HUGETLB_DEBUG +Once set, this will give very detailed output on what is happening in the +library and run extra diagnostics. + +.SH SEE ALSO +.I oprofile(1), +.I hugectl(8), +.I hugeedit(8), +.I get_huge_pages(3), +.I free_huge_pages(3) +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/default/libhugetlbfs/libhugetlbfs/man/pagesize.1 b/default/libhugetlbfs/libhugetlbfs/man/pagesize.1 new file mode 100644 index 0000000..7e6efce --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/pagesize.1 @@ -0,0 +1,57 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH PAGESIZE 1 "October 10, 2008" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +pagesize \- Print supported system page sizes +.SH SYNOPSIS +.B pagesize [options] +.SH DESCRIPTION + +The pagesize utility prints the page sizes of a page of memory in bytes, +as returned by getpagesizes(3). This is useful when creating portable shell +scripts, configuring huge page pools with hugeadm or launching applications +to use huge pages with hugectl. + +If no parameters are specified, \fBpagesize\fP prints the system base page +size as returned by \fBgetpagesize()\fP. The following parameters affect +what other pagesizes are displayed. + +.TP +.B --huge-only, -H + +Display all huge pages supported by the system as returned by +\fBgethugepagesizes()\fP. + +.TP +.B --all, -a + +Display all page sizes supported by the system. 
+ +.SH SEE ALSO +.I oprofile(1), +.I getpagesize(2), +.I getpagesizes(3), +.I gethugepagesizes(3), +.I hugectl(7), +.I hugeadm(7), +.I libhugetlbfs(7) + +.br +.SH AUTHORS +libhugetlbfs was written by various people on the libhugetlbfs-devel +mailing list. + diff --git a/default/libhugetlbfs/libhugetlbfs/man/tlbmiss_cost.sh.8 b/default/libhugetlbfs/libhugetlbfs/man/tlbmiss_cost.sh.8 new file mode 100644 index 0000000..837dc66 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/man/tlbmiss_cost.sh.8 @@ -0,0 +1,85 @@ +.\" Hey, EMACS: -*- nroff -*- +.\" First parameter, NAME, should be all caps +.\" Second parameter, SECTION, should be 1-8, maybe w/ subsection +.\" other parameters are allowed: see man(7), man(1) +.TH TLBMISS_COST.SH 8 "16 December, 2009" +.\" Please adjust this date whenever revising the manpage. +.\" +.\" Some roff macros, for reference: +.\" .nh disable hyphenation +.\" .hy enable hyphenation +.\" .ad l left justify +.\" .ad b justify to both left and right margins +.\" .nf disable filling +.\" .fi enable filling +.\" .br insert line break +.\" .sp <n> insert n+1 empty lines +.\" for manpage-specific macros, see man(7) +.SH NAME +tlbmiss_cost.sh \- Measure the cost in CPU cycles of a TLB miss +.SH SYNOPSIS +.B tlbmiss_cost.sh [options] +.SH DESCRIPTION +\fBtlbmiss_cost.sh\fP uses oprofile or calibator to calculate the cost in CPU +cycles of servicing a TLB miss. The method used depends on the architecture +where the script is being run. On POWERPC, oprofile is used with the STREAM +benchmark (available here: http://www.cs.virginia.edu/stream/FTP/Code/stream.c). +On X86 and X86-64, calibrator is used (source available here: +http://homepages.cwi.nl/~manegold/Calibrator/v0.9e/calibrator.c) +These programs will need to be in place and available to \fBtlbmiss_cost.sh\fP +in order for \fBcpupcstat\fP to calculate the percentage of time spent servicing +TLB misses automatically. 
\fBtlbmiss_cost.sh\fP can fetch and build these +programs for you with the appropriate options. + +The following options can be used to configure how \fBtlbmiss_cost.sh\fP works: + +.TP +.B --calibrator </path/to/calibrator> + +This option allows the user to specify the location of the \fBcalibrator\fP +tool. If this is not specified the script will check the path and the current +working directory for \fBcalibrator\fP + +.TP +.B --stream </path/to/STREAM> + +This option allows the user to specify the location of the \fBSTREAM\fP +benchmarking tool (note that is this is not \fBstream(1)\fP). If this is not +specified the script will check the path and the current working directory for +\fBSTREAM\fP + +.TP +.B --time-servicing + +Add an additional column printing out what percentage of time was spend +servicing TLB misses. + +.TP +.B --verbose + +This option increases the verbosity of the ouput. + +.TP +.B --quiet + +This option decreases the verbosity of the output. + +.TP +.B --fetch-calibrator + +This option has the script attempt to fetch the source for \fBcalibrator\fP, +builds it, and makes it available to \fBtlbmiss_cost.sh\fP + +.TP +.B --fetch-stream + +This option has the script attempt to fetch the source for \fBSTREAM\fP, builds +it, and makes it available to \fBtlbmiss_cost.sh\fP + +.SH SEE ALSO +.I cpupcstat(8) +.br +.SH AUTHORS +Eric B Munson <ebmunson@xxxxxxxxxx> is the primary author. See the documentation +for other contributors. + diff --git a/default/libhugetlbfs/libhugetlbfs/mktarball b/default/libhugetlbfs/libhugetlbfs/mktarball new file mode 100755 index 0000000..8855204 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/mktarball @@ -0,0 +1,32 @@ +#!/bin/sh +# +# +P='mktarball' + +if [ "$#" -ne 1 ]; then + echo 1>&2 "Usage: $P <commitish>" + exit 1 +fi +commit="$1" + +if [ ! .git ]; then + echo 1>&2 "$P: not in the source tree" + exit 1 +fi + +# Get the official name for this commit. 
+name=`git describe --tags "$commit"` + +# Build a version file to add to the tarball, we know it is not +# modified as we just took a commit which is unmodified by definition. +tmp="/tmp/tarball-$$" +mkdir -p "$tmp/libhugetlbfs-$name" +echo "$name" >"$tmp/libhugetlbfs-$name/version" + +git archive --format=tar --prefix="libhugetlbfs-$name/" "$commit" \ + >"libhugetlbfs-$name.tar" +tar -C "$tmp" -rf "libhugetlbfs-$name.tar" "libhugetlbfs-$name/version" +gzip -9 "libhugetlbfs-$name.tar" + +# Clean up the version. +[ -d "$tmp/libhugetlbfs-$name" ] && rm -rf "$tmp" diff --git a/default/libhugetlbfs/libhugetlbfs/morecore.c b/default/libhugetlbfs/libhugetlbfs/morecore.c new file mode 100644 index 0000000..c02b11a --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/morecore.c @@ -0,0 +1,366 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <malloc.h> +#include <unistd.h> +#include <sys/mman.h> +#include <errno.h> +#include <dlfcn.h> +#include <string.h> +#include <fcntl.h> + +#include "hugetlbfs.h" + +#include "libhugetlbfs_internal.h" + +static int heap_fd; + +static void *heapbase; +static void *heaptop; +static long mapsize; +static long hpage_size; + +static long hugetlbfs_next_addr(long addr) +{ +#if defined(__powerpc64__) + return ALIGN(addr, 1L << SLICE_HIGH_SHIFT); +#elif defined(__powerpc__) + return ALIGN(addr, 1L << SLICE_LOW_SHIFT); +#elif defined(__ia64__) + if (addr < (1UL << SLICE_HIGH_SHIFT)) + return ALIGN(addr, 1UL << SLICE_HIGH_SHIFT); + else + return ALIGN(addr, hpage_size); +#else + return ALIGN(addr, hpage_size); +#endif +} + +/* + * Our plan is to ask for pages 'roughly' at the BASE. We expect and + * require the kernel to offer us sequential pages from wherever it + * first gave us a page. If it does not do so, we return the page and + * pretend there are none this covers us for the case where another + * map is in the way. This is required because 'morecore' must have + * 'sbrk' semantics, ie. return sequential, contigious memory blocks. + * Luckily, if it does not do so and we error out malloc will happily + * go back to small pages and use mmap to get them. Hurrah. + */ +static void *hugetlbfs_morecore(ptrdiff_t increment) +{ + int ret; + void *p; + long delta; + int mmap_reserve = __hugetlb_opts.no_reserve ? 
MAP_NORESERVE : 0; + int mmap_hugetlb = 0; + int using_default_pagesize = + (hpage_size == kernel_default_hugepage_size()); + + INFO("hugetlbfs_morecore(%ld) = ...\n", (long)increment); + + /* + * how much to grow the heap by = + * (size of heap) + malloc request - mmap'd space + */ + delta = (heaptop-heapbase) + increment - mapsize; + + INFO("heapbase = %p, heaptop = %p, mapsize = %lx, delta=%ld\n", + heapbase, heaptop, mapsize, delta); + + /* align to multiple of hugepagesize. */ + delta = ALIGN(delta, hpage_size); + +#ifdef MAP_HUGETLB + mmap_hugetlb = MAP_HUGETLB; +#endif + + if (delta > 0) { + /* growing the heap */ + + INFO("Attempting to map %ld bytes\n", delta); + + /* map in (extend) more of the file at the end of our last map */ + if (__hugetlb_opts.map_hugetlb && using_default_pagesize) + p = mmap(heapbase + mapsize, delta, PROT_READ|PROT_WRITE, + mmap_hugetlb|MAP_ANONYMOUS|MAP_PRIVATE|mmap_reserve, + heap_fd, mapsize); + else + p = mmap(heapbase + mapsize, delta, PROT_READ|PROT_WRITE, + MAP_PRIVATE|mmap_reserve, heap_fd, mapsize); + + if (p == MAP_FAILED) { + WARNING("New heap segment map at %p failed: %s\n", + heapbase+mapsize, strerror(errno)); + return NULL; + } + + /* if this is the first map */ + if (! 
mapsize) { + if (heapbase && (heapbase != p)) { + WARNING("Heap originates at %p instead of %p\n", + p, heapbase); + if (__hugetlbfs_debug) + dump_proc_pid_maps(); + } + /* then setup the heap variables */ + heapbase = heaptop = p; + } else if (p != (heapbase + mapsize)) { + /* Couldn't get the mapping where we wanted */ + munmap(p, delta); + WARNING("New heap segment mapped at %p instead of %p\n", + p, heapbase + mapsize); + if (__hugetlbfs_debug) + dump_proc_pid_maps(); + return NULL; + } + + /* Fault the region to ensure accesses succeed */ + if (hugetlbfs_prefault(p, delta) != 0) { + munmap(p, delta); + return NULL; + } + + /* we now have mmap'd further */ + mapsize += delta; + } else if (delta < 0) { + /* shrinking the heap */ + + if (!__hugetlb_opts.shrink_ok) { + /* shouldn't ever get here */ + WARNING("Heap shrinking is turned off\n"); + return NULL; + } + + if (!mapsize) { + WARNING("Can't shrink empty heap!\n"); + return NULL; + } + + /* + * If we are forced to change the heapaddr from the + * original brk() value we have violated brk semantics + * (which we are not supposed to do). This shouldn't + * pose a problem until glibc tries to trim the heap to an + * address lower than what we aligned heapaddr to. At that + * point the alignment "gap" causes heap corruption. + * So we don't allow the heap to shrink below heapbase. + */ + if (mapsize + delta < 0) { /* remember: delta is negative */ + WARNING("Unable to shrink heap below %p\n", heapbase); + /* unmap just what is currently mapped */ + delta = -mapsize; + /* we need heaptop + increment == heapbase, so: */ + increment = heapbase - heaptop; + } + INFO("Attempting to unmap %ld bytes @ %p\n", -delta, + heapbase + mapsize + delta); + ret = munmap(heapbase + mapsize + delta, -delta); + if (ret) { + WARNING("Unmapping failed while shrinking heap: " + "%s\n", strerror(errno)); + } else if (!__hugetlb_opts.map_hugetlb && !using_default_pagesize){ + + /* + * Now shrink the hugetlbfs file. 
+ */ + mapsize += delta; + ret = ftruncate(heap_fd, mapsize); + if (ret) { + WARNING("Could not truncate hugetlbfs file to " + "shrink heap: %s\n", strerror(errno)); + } + } + + } + + /* heap is continuous */ + p = heaptop; + /* and we now have added this much more space to the heap */ + heaptop = heaptop + increment; + + INFO("... = %p\n", p); + return p; +} + +static void *thp_morecore(ptrdiff_t increment) +{ + void *p; + long delta; + + INFO("thp_morecore(%ld) = ...\n", (long)increment); + + delta = (heaptop - heapbase) + increment - mapsize; + delta = ALIGN(delta, hpage_size); + + if (delta > 0) { + /* + * This first time we expand the mapping we need to account for + * the initial heap mapping not necessarily being huge page + * aligned + */ + if (!mapsize) + delta = hugetlbfs_next_addr((long)heapbase + delta) - + (unsigned long)heapbase; + + INFO("Adding %ld bytes to heap\n", delta); + + p = sbrk(delta); + if (p == (void *)-1) { + WARNING("sbrk returned ENOMEM\n"); + return NULL; + } + + if (!mapsize) { + if (heapbase && (heapbase != p)) { + WARNING("Heap was expected at %p instead of %p, " + "heap has been modified by someone else!\n", + heapbase, p); + if (__hugetlbfs_debug) + dump_proc_pid_maps(); + } + heapbase = heaptop = p; + } + + mapsize += delta; +#ifdef MADV_HUGEPAGE + madvise(p, delta, MADV_HUGEPAGE); +#endif + } else if (delta < 0) { + /* shrinking the heap */ + if (!mapsize) { + WARNING("Can't shrink an empty heap\n"); + return NULL; + } + + INFO("Attempting to shrink heap by %ld bytes with sbrk\n", + -delta); + p = sbrk(delta); + if (p == (void *)-1) { + WARNING("Unable to shrink heap\n"); + return heaptop; + } + + mapsize += delta; + } + + p = heaptop; + heaptop += increment; + INFO("... = %p\n", p); + return p; +} + +void hugetlbfs_setup_morecore(void) +{ + char *ep; + unsigned long heapaddr; + + if (! 
__hugetlb_opts.morecore) + return; + if (strcasecmp(__hugetlb_opts.morecore, "no") == 0) { + INFO("HUGETLB_MORECORE=%s, not setting up morecore\n", + __hugetlb_opts.morecore); + return; + } + + /* + * Determine the page size that will be used for the heap. + * This can be set explicitly by setting HUGETLB_MORECORE to a valid + * page size string or by setting HUGETLB_DEFAULT_PAGE_SIZE. + */ + if (strncasecmp(__hugetlb_opts.morecore, "y", 1) == 0) + hpage_size = gethugepagesize(); + else if (__hugetlb_opts.thp_morecore) + hpage_size = kernel_default_hugepage_size(); + else + hpage_size = parse_page_size(__hugetlb_opts.morecore); + + if (hpage_size <= 0) { + if (errno == ENOSYS) + WARNING("Hugepages unavailable\n"); + else if (errno == EOVERFLOW || errno == ERANGE) + WARNING("Hugepage size too large\n"); + else if (errno == EINVAL) + WARNING("Invalid huge page size\n"); + else + WARNING("Hugepage size (%s)\n", strerror(errno)); + return; + } + + /* + * We won't need an fd for the heap mmaps if we are using MAP_HUGETLB + * or we are depending on transparent huge pages + */ + if(__hugetlb_opts.thp_morecore || (__hugetlb_opts.map_hugetlb && + hpage_size == kernel_default_hugepage_size())) { + heap_fd = -1; + } else { + if (!hugetlbfs_find_path_for_size(hpage_size)) { + WARNING("Hugepage size %li unavailable", hpage_size); + return; + } + + heap_fd = hugetlbfs_unlinked_fd_for_size(hpage_size); + if (heap_fd < 0) { + WARNING("Couldn't open hugetlbfs file for morecore\n"); + return; + } + } + + /* + * THP morecore uses sbrk to allocate more heap space, counting on the + * kernel to back the area with THP. So setting heapbase is + * meaningless if thp_morecore is used. 
+ */ + if (!__hugetlb_opts.thp_morecore && __hugetlb_opts.heapbase) { + heapaddr = strtoul(__hugetlb_opts.heapbase, &ep, 16); + if (*ep != '\0') { + WARNING("Can't parse HUGETLB_MORECORE_HEAPBASE: %s\n", + __hugetlb_opts.heapbase); + return; + } + } else { + heapaddr = (unsigned long)sbrk(0); + if (!__hugetlb_opts.thp_morecore) + heapaddr = hugetlbfs_next_addr(heapaddr); + } + + INFO("setup_morecore(): heapaddr = 0x%lx\n", heapaddr); + + heaptop = heapbase = (void *)heapaddr; + if (__hugetlb_opts.thp_morecore) + __morecore = &thp_morecore; + else + __morecore = &hugetlbfs_morecore; + + /* Set some allocator options more appropriate for hugepages */ + + if (__hugetlb_opts.shrink_ok) + mallopt(M_TRIM_THRESHOLD, hpage_size / 2); + else + mallopt(M_TRIM_THRESHOLD, -1); + mallopt(M_TOP_PAD, hpage_size / 2); + /* we always want to use our morecore, not ordinary mmap(). + * This doesn't appear to prohibit malloc() from falling back + * to mmap() if we run out of hugepages. */ + mallopt(M_MMAP_MAX, 0); +} diff --git a/default/libhugetlbfs/libhugetlbfs/oprofile_map_events.pl b/default/libhugetlbfs/libhugetlbfs/oprofile_map_events.pl new file mode 100755 index 0000000..eb413e2 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/oprofile_map_events.pl @@ -0,0 +1,146 @@ +#!/usr/bin/perl +# This script attempts to map a high-level CPU event to the oprofile counter +# of the current CPU +# Licensed under LGPL 2.1 as packaged with libhugetlbfs +# (c) Mel Gorman 2008 + +use Getopt::Long; +use FindBin qw($Bin); +use lib "$Bin"; + +use TLBC::Report; +use strict; + +my ($arch, $cputype); +my $opt_verbose; +my $opt_event; +my $opt_cycle_factor=1; +my $opt_event_factor=1; +my $p = "oprofile_map_events.pl"; + +my $oprofile_event; +my (%map_event_name, %map_event_mask); + +# CPU events miss table +$map_event_name{"i386##dtlb_miss"} = "PAGE_WALK_TYPE:100000:0x01"; +$map_event_name{"i386##p4##timer"} = "GLOBAL_POWER_EVENTS:100000:0x01"; +$map_event_name{"i386##p4##dtlb_miss"} = 
"PAGE_WALK_TYPE:3000:0x01"; +$map_event_name{"i386##p4##l2cache_miss"} = "BSQ_CACHE_REFERENCE:3000:0x300"; +$map_event_name{"i386##p4-ht##timer"} = "GLOBAL_POWER_EVENTS:6000:0x01"; +$map_event_name{"i386##p4-ht##dtlb_miss"} = "PAGE_WALK_TYPE:3000:0x01"; +$map_event_name{"i386##p4-ht##l2cache_miss"} = "BSQ_CACHE_REFERENCE:6000:0x300"; +$map_event_name{"i386##core##timer"} = "CPU_CLK_UNHALTED:6000"; +$map_event_name{"i386##core##dtlb_miss"} = "DTLB_MISS:500"; +$map_event_name{"i386##core##instructions"} = "INST_RETIRED:6000"; +$map_event_name{"i386##core_2##dtlb_miss"} = "DTLB_MISSES:500:0x01"; +$map_event_name{"i386##core_2##timer"} = "CPU_CLK_UNHALTED:6000"; +$map_event_name{"i386##core_2##instructions"} = "INST_RETIRED_ANY_P:6000"; +$map_event_name{"x86-64##timer"} = "CPU_CLK_UNHALTED:100000"; +$map_event_name{"x86-64##hammer##dtlb_miss"} = "L1_AND_L2_DTLB_MISSES:100000"; +$map_event_name{"x86-64##hammer##l1cache_miss"} = "DATA_CACHE_MISSES:500"; +$map_event_name{"x86-64##hammer##l2cache_miss"} = "L2_CACHE_MISS:500"; +$map_event_name{"x86-64##family10##dtlb_miss"} = "L1_DTLB_AND_L2_DTLB_MISS:500"; +$map_event_name{"x86-64##family10##l1cache_miss"} = "DATA_CACHE_MISSES:500"; +$map_event_name{"x86-64##family10##l2cache_miss"} = "L2_CACHE_MISS:500"; +$map_event_name{"x86-64##core_2##dtlb_miss"} = "DTLB_MISSES:500:0x01"; +$map_event_name{"x86-64##core_2##timer"} = "CPU_CLK_UNHALTED:6000"; +$map_event_name{"x86-64##core_2##instructions"} = "INST_RETIRED_ANY_P:6000"; +$map_event_name{"ppc64##timer"} = "CYCLES:10000"; +$map_event_name{"ppc64##dtlb_miss"} = "PM_DTLB_MISS_GRP44:100000"; +$map_event_name{"ppc64##timer30"} = "PM_CYC_GRP30:10000"; +$map_event_name{"ppc64##tablewalk_cycles"} = "PM_DATA_TABLEWALK_CYC_GRP30:1000"; +$map_event_name{"ppc64##970MP##timer"} = "PM_CYC_GRP22:10000"; +$map_event_name{"ppc64##970MP##dtlb_miss"} = "PM_DTLB_MISS_GRP22:1000"; +$map_event_name{"ppc64##970MP##l1cache_ld_miss"} = "PM_LD_MISS_L1_GRP22:1000"; 
+$map_event_name{"ppc64##970MP##l1cache_st_miss"} = "PM_ST_MISS_L1_GRP22:1000"; +$map_event_name{"ppc64##970MP##timer50"} = "PM_CYC_GRP50:10000"; +$map_event_name{"ppc64##970MP##l1l2cache_miss"} = "PM_DATA_FROM_MEM_GRP50:1000"; +$map_event_name{"ppc64##970MP##timer30"} = "PM_CYC_GRP30:10000"; +$map_event_name{"ppc64##970MP##tablewalk_cycles"} = "PM_DATA_TABLEWALK_CYC_GRP30:1000"; +$map_event_name{"ppc64##power5##dtlb_miss"} = "PM_DTLB_MISS_GRP44:100000"; +$map_event_name{"ppc64##power5##tablewalk_cycles"} = "PM_DATA_TABLEWALK_CYC_GRP44:1000"; +$map_event_name{"ppc64##power4##dtlb_miss"} = "PM_DTLB_MISS_GRP9:1000"; +$map_event_name{"ppc64##power4##tablewalk_cycles"} = "PM_DATA_TABLEWALK_CYC_GRP9:1000"; +$map_event_name{"ppc64##power6##dtlb_miss"} = "PM_LSU_DERAT_MISS_GRP76:1000"; +$map_event_name{"ppc64##power6##tablewalk_cycles"} = "PM_LSU_DERAT_MISS_CYC_GRP76:1000"; +$map_event_name{"ppc64##power7##timer"} = "PM_RUN_CYC_GRP12:10000"; +$map_event_name{"ppc64##power7##timer30"} = "PM_RUN_CYC_GRP86:10000"; +$map_event_name{"ppc64##power7##dtlb_miss"} = "PM_DTLB_MISS_GRP12:1000"; +$map_event_name{"ppc64##power7##tablewalk_cycles"} = "PM_DATA_TABLEWALK_CYC_GRP86:1000"; + +GetOptions( + 'verbose' => \$opt_verbose, + 'sample-cycle-factor|c=n' => \$opt_cycle_factor, + 'sample-event-factor|e=n' => \$opt_event_factor, + 'event|e=s' => \$opt_event, + ); +setVerbose if $opt_verbose; + +if ($opt_event eq "" || $opt_event eq "default") { + print "default\n"; + exit(0); +} + +# Run --list-events to setup devices +open (SETUP, "opcontrol --list-events|") || die("Failed to exec opcontrol"); +printVerbose("$p\::init list-events\n"); +while (!eof(SETUP)) { + $_ = <SETUP>; +} +close(SETUP); + +# Read the arch and CPU type +open (CPUTYPE, "/proc/sys/dev/oprofile/cpu_type") || + open (CPUTYPE, "/dev/oprofile/cpu_type") || + die("Failed to open cpu_type oprofile device"); +($arch, $cputype) = split(/\//, <CPUTYPE>); +close CPUTYPE; +printVerbose("$p\::arch = $arch\n"); 
+printVerbose("$p\::cputype = $cputype\n"); +printVerbose("$p\::event = $opt_event\n"); + +# Lookup the event for the processor +$oprofile_event = $map_event_name{"$arch##$cputype##$opt_event"}; +printVerbose("$p\::lookup $arch##$cputype##$opt_event = $oprofile_event\n"); +if ($oprofile_event eq "") { + $oprofile_event = $map_event_name{"$arch##$opt_event"}; + printVerbose("$p\:: lookup $arch##$opt_event = $oprofile_event\n"); +} + +# If unknown, exit with failure +if ($oprofile_event eq "") { + print "UNKNOWN_EVENT\n"; + exit(-2); +} + +# Apply the sampling factor if specified +if ($opt_cycle_factor != 1 || $opt_event_factor != 1) { + my ($event, $sample, $mask) = split(/:/, $oprofile_event); + + if ($opt_event =~ /^timer[0-9]*/) { + $sample *= $opt_cycle_factor; + } else { + $sample *= $opt_event_factor; + } + if ($mask eq "") { + $oprofile_event = "$event:$sample"; + } else { + $oprofile_event = "$event:$sample:$mask"; + } +} + +# Verify opcontrol agrees +open (VERIFY, "opcontrol --list-events|") || die("Failed to exec opcontrol"); +my ($oprofile_event_name) = split(/:/, $oprofile_event); +printVerbose("$p\::checking $oprofile_event_name\n"); +while (!eof(VERIFY)) { + if (<VERIFY> =~ /^$oprofile_event_name:/) { + close(VERIFY); + print "$oprofile_event\n"; + exit(0); + } +} +close(VERIFY); +printVerbose("$p\::opcontrol --list-events disagrees\n"); +print "UNKNOWN_OPROFILE_DISPARITY\n"; +exit(-3); diff --git a/default/libhugetlbfs/libhugetlbfs/oprofile_start.sh b/default/libhugetlbfs/libhugetlbfs/oprofile_start.sh new file mode 100755 index 0000000..9c2d95d --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/oprofile_start.sh @@ -0,0 +1,85 @@ +#!/bin/bash +# Script to start oprofile + +usage() { + echo "oprofile_start.sh (c) Mel Gorman 2008" + echo This script starts the oprofile daemon + echo + echo "Usage: oprofile_start.sh [options]" + echo " --event High-level oprofile event to track" + echo " --vmlinux Path to vmlinux" + echo " --sample-cycle-factor Factor 
which to slow down CPU cycle sampling by" + echo " --sample-event-factor Factor which to slow down event sampling by" + echo " --systemmap Guess" + echo " -h, --help Print this help message" + echo + exit +} + +# Parse command-line arguements +SCRIPTROOT=`echo $0 | sed -e 's/oprofile_start.sh$//' | sed -e 's/^\.\///'` +EVENT=default +VMLINUX=/boot/vmlinux-`uname -r` +SYSTEMMAP=/boot/System.map-`uname -r` +FACTOR= +export PATH=$SCRIPTROOT:$PATH +ARGS=`getopt -o h --long help,event:,vmlinux:,systemmap:,sample-event-factor:,sample-cycle-factor: -n oprofile_start.sh -- "$@"` + +# Cycle through arguements +eval set -- "$ARGS" +while true ; do + case "$1" in + --event) EVENTS="$EVENTS $2"; shift 2;; + --vmlinux) VMLINUX=$2; shift 2;; + --sample-cycle-factor) CYCLE_FACTOR="--sample-cycle-factor $2"; shift 2;; + --sample-event-factor) EVENT_FACTOR="--sample-event-factor $2"; shift 2;; + --systemmap) SYSTEMMAP=$2; shift 2;; + -h|--help) usage;; + *) shift 1; break;; + esac +done + +# Map the events +for EVENT in $EVENTS; do + LOWLEVEL_EVENT="$LOWLEVEL_EVENT --event `oprofile_map_events.pl $EVENT_FACTOR $CYCLE_FACTOR --event $EVENT`" + if [ $? -ne 0 ]; then + echo Failed to map event $EVENT to low-level oprofile event. Verbose output follows + oprofile_map_events.pl --event $EVENT --verbose + exit -1 + fi +done + +# Check vmlinux file exists +if [ "$VMLINUX" = "" -o ! -e $VMLINUX ]; then + echo vmlinux file \"$VMLINUX\" does not exist + exit -1 +fi + +echo Stage 1: Shutting down if running and resetting +bash opcontrol --reset +bash opcontrol --stop +bash opcontrol --reset +bash opcontrol --deinit +echo + +# Setup the profiler +echo Stage 2: Setting up oprofile +echo High-level event: $EVENTS +echo Low-level event: `echo $LOWLEVEL_EVENT | sed -e 's/--event //'` +echo vmlinux: $VMLINUX +echo opcontrol --setup $LOWLEVEL_EVENT --vmlinux=$VMLINUX +bash opcontrol --setup $LOWLEVEL_EVENT --vmlinux=$VMLINUX +if [ $? 
-ne 0 ]; then + echo opcontrol --setup returned failed + exit -1 +fi + +# Start the profiler +echo Stage 3: Starting profiler +bash opcontrol --start +if [ $? -ne 0 ]; then + echo opcontrol --start returned failure + exit -1 +fi + +exit 0 diff --git a/default/libhugetlbfs/libhugetlbfs/pagesize.c b/default/libhugetlbfs/libhugetlbfs/pagesize.c new file mode 100644 index 0000000..659eb27 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/pagesize.c @@ -0,0 +1,140 @@ +/*************************************************************************** + * User front end for using huge pages Copyright (C) 2008, IBM * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the Lesser GNU General Public License as * + * published by the Free Software Foundation; either version 2.1 of the * + * License, or at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the Lesser GNU General Public * + * License along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +/* + * pagesize exposes the available and hardware supported page sizes on + * the system. + * + * This program should be treated as an ABI for using libhugetlbfs. 
+ */ + +#include <stdlib.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <limits.h> + +#define _GNU_SOURCE /* for getopt_long */ +#include <unistd.h> +#include <getopt.h> + +#define REPORT_UTIL "pagesize" +#include "libhugetlbfs_internal.h" +#include "hugetlbfs.h" + +extern int errno; +extern int optind; +extern char *optarg; + +#define OPTION(opts, text) fprintf(stderr, " %-25s %s\n", opts, text) +#define CONT(text) fprintf(stderr, " %-25s %s\n", "", text) + +void print_usage() +{ + fprintf(stderr, "pagesize [options] target\n"); + fprintf(stderr, "options:\n"); + + OPTION("--help, -h", "Prints this message"); + + OPTION("--all, -a", "show all supported page sizes"); + OPTION("--huge-only, -H", "show only huge page sizes"); +} + +static int cmpsizes(const void *p1, const void *p2) +{ + return *((long *)p1) > *((long *)p2); +} + +#define MAX_PAGESIZES 32 + +int main(int argc, char** argv) +{ + int opt_all = 0; + int opt_huge = 0; + + char opts[] = "+haH"; + int ret = 0, index = 0; + struct option long_opts[] = { + {"all", no_argument, NULL, 'a'}, + {"huge-only", no_argument, NULL, 'H'}, + + {0}, + }; + + long pagesizes[MAX_PAGESIZES]; + int i; + + hugetlbfs_setup_debug(); + + while (ret != -1) { + ret = getopt_long(argc, argv, opts, long_opts, &index); + switch (ret) { + case '?': + print_usage(); + exit(EXIT_FAILURE); + + case 'h': + print_usage(); + exit(EXIT_SUCCESS); + + case 'a': + opt_all = 1; + INFO("selecting all page sizes\n"); + break; + + case 'H': + opt_huge = 1; + opt_all = 1; + INFO("selecting only huge page sizes\n"); + break; + + case -1: + break; + + default: + WARNING("unparsed option %08x\n", ret); + ret = -1; + break; + } + } + index = optind; + if ((argc - index) != 0) { + print_usage(); + exit(EXIT_FAILURE); + } + + if (!opt_all) { + pagesizes[0] = sysconf(_SC_PAGESIZE); + ret = 1; + } else if (opt_huge) + ret = gethugepagesizes(pagesizes, MAX_PAGESIZES); + else + ret = getpagesizes(pagesizes, MAX_PAGESIZES); + if 
(ret < 0) { + ERROR("failed to get list of supported page sizes\n"); + exit(EXIT_FAILURE); + } + + qsort(pagesizes, ret, sizeof(long), cmpsizes); + for (i = 0; i < ret; i++) { + printf("%ld\n", pagesizes[i]); + } + + exit(EXIT_SUCCESS); +} diff --git a/default/libhugetlbfs/libhugetlbfs/privutils.lds b/default/libhugetlbfs/libhugetlbfs/privutils.lds new file mode 100644 index 0000000..5d481e2 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/privutils.lds @@ -0,0 +1,6 @@ +VERS_1.0 { + global: + __pu_*; + local: + *; +}; diff --git a/default/libhugetlbfs/libhugetlbfs/shm.c b/default/libhugetlbfs/libhugetlbfs/shm.c new file mode 100644 index 0000000..1f82cab --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/shm.c @@ -0,0 +1,143 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE +#include <dlfcn.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/types.h> +#include "libhugetlbfs_internal.h" +#include "hugetlbfs.h" +#include <sys/syscall.h> + +#if defined(SYS_shmget) || defined(SYS_ipc) +#define HAVE_SHMGET_SYSCALL +#endif + +#ifdef HAVE_SHMGET_SYSCALL +/* + * The calls to dlsym() and dlerror() in the shmget() wrapper below force + * a dependency on libdl.so. This does not work for static executables + * as the glibc dynamic library implementation does not automatically + * have static dl* function stubs linked into static executables. + * + * Work around this problem by adding a weak attribute to the declarations + * of dlsym() and dlerror(). (The declaration is otherwise the same as in + * <dlfcn.h>). This allows a static executable to be linked without -ldl. + * If &dlsym is NULL then this is a static executable and a call to the + * system shmget() may be performed without worry as there is no dynamic + * call chain. + */ +extern void *dlsym (void *__restrict __handle, __const char *__restrict __name) + __attribute__((weak)) __THROW __nonnull ((2)); +extern char *dlerror (void) __attribute__((weak)) __THROW; + + +/* call syscall shmget through the generic syscall mechanism */ +static int syscall_shmget(key_t key, size_t size, int shmflg) +{ +#ifdef SYS_shmget + return syscall(SYS_shmget, key, size, shmflg); +#else + /* + * Some platforms do not have have a direct shmget syscall. Instead, + * all SysV IPC calls are funneled through the ipc() system call. + * + * ipc() is expected to only be used by libc implementors, so using + * it has not been smoothed out. 
There is no function declaration. + * The needed define for SHMGET is in linux/ipc.h, but that file + * also includes a conflicting definition of ipc_perm. So, + * just define the needed items here. + * + * When compiling -m32 on x86_64, the ipc glibc wrapper does not + * exist. Instead, just use SYS_ipc. + * + * The ipc system call below does not set the IPC_64 version flag + * with SHMGET because that would have required more private defines + * and the version number is not used for the SHMGET call. + */ + #define SHMGET 23 + + return syscall(SYS_ipc, SHMGET, key, size, shmflg, (void *)NULL, 0L); +#endif +} + +#endif /* HAVE_SHMGET_SYSCALL */ + +int shmget(key_t key, size_t size, int shmflg) +{ + static int (*real_shmget)(key_t key, size_t size, int shmflg) = NULL; + char *error; + int retval; + size_t aligned_size = size; + + DEBUG("hugetlb_shmem: entering overridden shmget() call\n"); + + /* Get a handle to the "real" shmget system call */ + if (!real_shmget) { +#ifdef HAVE_SHMGET_SYSCALL + if (&dlsym == NULL) { + /* in a static executable, call shmget directly */ + real_shmget = syscall_shmget; + } else +#endif /* HAVE_SHMGET_SYSCALL */ + { + real_shmget = dlsym(RTLD_NEXT, "shmget"); + if ((error = dlerror()) != NULL) { + ERROR("%s", error); + return -1; + } + } + } + + /* Align the size and set SHM_HUGETLB on request */ + if (__hugetlb_opts.shm_enabled) { + /* + * Use /proc/meminfo because shm always uses the system + * default huge page size. + */ + long hpage_size = kernel_default_hugepage_size(); + aligned_size = ALIGN(size, hpage_size); + if (size != aligned_size) { + DEBUG("hugetlb_shmem: size growth align %zd -> %zd\n", + size, aligned_size); + } + + INFO("hugetlb_shmem: Adding SHM_HUGETLB flag\n"); + shmflg |= SHM_HUGETLB; + } else { + DEBUG("hugetlb_shmem: shmget override not requested\n"); + } + + /* Call the "real" shmget. 
If hugepages fail, use small pages */ + retval = real_shmget(key, aligned_size, shmflg); + if (retval == -1 && __hugetlb_opts.shm_enabled) { + WARNING("While overriding shmget(%zd) to add SHM_HUGETLB: %s\n", + aligned_size, strerror(errno)); + shmflg &= ~SHM_HUGETLB; + retval = real_shmget(key, size, shmflg); + WARNING("Using small pages for shmget despite HUGETLB_SHM\n"); + } + + return retval; +} diff --git a/default/libhugetlbfs/libhugetlbfs/sys-elf32ppclinux.S b/default/libhugetlbfs/libhugetlbfs/sys-elf32ppclinux.S new file mode 100644 index 0000000..65d8b3f --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/sys-elf32ppclinux.S @@ -0,0 +1,34 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2007 David Gibson, IBM Corporation. + * + * Based on code from the GNU C Library, Copyright Free Software Foundation, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + .text + + .globl direct_syscall +direct_syscall: + mr 0,3 + mr 3,4 + mr 4,5 + mr 5,6 + mr 6,7 + mr 7,8 + mr 8,9 + sc + blr diff --git a/default/libhugetlbfs/libhugetlbfs/sys-elf64ppc.S b/default/libhugetlbfs/libhugetlbfs/sys-elf64ppc.S new file mode 100644 index 0000000..1b63ff0 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/sys-elf64ppc.S @@ -0,0 +1,43 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2007 David Gibson, IBM Corporation. + * + * Based on code from the GNU C Library, Copyright Free Software Foundation, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + .text + + .align 2 + .globl direct_syscall + .globl .direct_syscall + .section ".opd","aw" +direct_syscall: + .quad .direct_syscall + .quad .TOC.@tocbase + .quad 0 + .previous + .type .direct_syscall,@function +.direct_syscall: + mr 0,3 + mr 3,4 + mr 4,5 + mr 5,6 + mr 6,7 + mr 7,8 + mr 8,9 + sc + blr diff --git a/default/libhugetlbfs/libhugetlbfs/sys-elf_i386.S b/default/libhugetlbfs/libhugetlbfs/sys-elf_i386.S new file mode 100644 index 0000000..ab30c8d --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/sys-elf_i386.S @@ -0,0 +1,42 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2007 David Gibson, IBM Corporation. + * + * Based on code from the GNU C Library, Copyright Free Software Foundation, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + .text + + .globl direct_syscall +direct_syscall: + push %ebp + push %edi + push %esi + push %ebx + mov 0x2c(%esp),%ebp + mov 0x28(%esp),%edi + mov 0x24(%esp),%esi + mov 0x20(%esp),%edx + mov 0x1c(%esp),%ecx + mov 0x18(%esp),%ebx + mov 0x14(%esp),%eax + int $0x80 + pop %ebx + pop %esi + pop %edi + pop %ebp + ret diff --git a/default/libhugetlbfs/libhugetlbfs/sys-elf_x86_64.S b/default/libhugetlbfs/libhugetlbfs/sys-elf_x86_64.S new file mode 100644 index 0000000..6af06ad --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/sys-elf_x86_64.S @@ -0,0 +1,34 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2007 David Gibson, IBM Corporation. + * + * Based on code from the GNU C Library, Copyright Free Software Foundation, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + + .text + + .globl direct_syscall +direct_syscall: + mov %rdi,%rax + mov %rsi,%rdi + mov %rdx,%rsi + mov %rcx,%rdx + mov %r8,%r10 + mov %r9,%r8 + mov 0x8(%rsp),%r9 + syscall + retq diff --git a/default/libhugetlbfs/libhugetlbfs/tests/.gitignore b/default/libhugetlbfs/libhugetlbfs/tests/.gitignore new file mode 100644 index 0000000..08e022a --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/.gitignore @@ -0,0 +1,4 @@ +gethugepagesize +test_root_hugetlbfs +find_path +tempfile diff --git a/default/libhugetlbfs/libhugetlbfs/tests/Makefile b/default/libhugetlbfs/libhugetlbfs/tests/Makefile new file mode 100644 index 0000000..cbf13ad --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/Makefile @@ -0,0 +1,284 @@ +PREFIX = /usr/local + +LIB_TESTS = gethugepagesize test_root find_path unlinked_fd misalign \ + readback truncate shared private fork-cow empty_mounts large_mounts \ + meminfo_nohuge ptrace-write-hugepage icache-hygiene slbpacaflush \ + chunk-overcommit mprotect alloc-instantiate-race mlock \ + truncate_reserve_wraparound truncate_sigbus_versus_oom \ + map_high_truncate_2 truncate_above_4GB direct \ + misaligned_offset brk_near_huge task-size-overrun stack_grow_into_huge \ + counters quota heap-overflow get_huge_pages get_hugepage_region \ + shmoverride_linked gethugepagesizes \ + madvise_reserve fadvise_reserve readahead_reserve \ + shm-perms \ + mremap-expand-slice-collision \ + mremap-fixed-normal-near-huge mremap-fixed-huge-near-normal +LIB_TESTS_64 = straddle_4GB huge_at_4GB_normal_below \ + huge_below_4GB_normal_above +NOLIB_TESTS = malloc malloc_manysmall dummy heapshrink shmoverride_unlinked +LDSCRIPT_TESTS = zero_filesize_segment +HUGELINK_TESTS = linkhuge linkhuge_nofd linkshare +HUGELINK_RW_TESTS = linkhuge_rw 
+STRESS_TESTS = mmap-gettest mmap-cow shm-gettest shm-getraw shm-fork +# NOTE: all named tests in WRAPPERS must also be named in TESTS +WRAPPERS = quota counters madvise_reserve fadvise_reserve \ + readahead_reserve mremap-expand-slice-collision \ + mremap-fixed-normal-near-huge mremap-fixed-huge-near-normal +HELPERS = get_hugetlbfs_path compare_kvers +HELPER_LIBS = libheapshrink.so +BADTOOLCHAIN = bad-toolchain.sh + +CFLAGS = -O2 -Wall -g +CPPFLAGS = -I.. +STATIC_LIBHUGE = -Wl,--whole-archive -lhugetlbfs -Wl,--no-whole-archive +STATIC_LDLIBS = -Wl,--no-as-needed -lpthread +LDLIBS = $(STATIC_LDLIBS) -ldl -lhugetlbfs_privutils +LDFLAGS32 = -L../obj32 +LDFLAGS64 = -L../obj64 +INSTALL = install + +TESTS = $(LIB_TESTS) $(NOLIB_TESTS) $(STRESS_TESTS) dummy.ldscript +ifdef ELF32 +TESTS += $(LDSCRIPT_TESTS) $(HUGELINK_TESTS) $(HUGELINK_TESTS:%=xB.%) \ + $(HUGELINK_TESTS:%=xBDT.%) $(HUGELINK_RW_TESTS) +else +ifdef ELF64 +TESTS += $(LDSCRIPT_TESTS) $(HUGELINK_TESTS) $(HUGELINK_TESTS:%=xB.%) \ + $(HUGELINK_TESTS:%=xBDT.%) +endif +endif + +ifneq ($(ARCH),ia64) +TESTS_64 = $(LIB_TESTS_64) +endif + +SCRIPTS=../ldscripts +SCRIPTS32 = $(SCRIPTS)/$(ELF32) +SCRIPTS64 = $(SCRIPTS)/$(ELF64) +HUGETLBFS_LD=../ld.hugetlbfs +INST_TESTSDIR32 = $(LIBDIR32)/libhugetlbfs/tests +INST_TESTSDIR64 = $(LIBDIR64)/libhugetlbfs/tests + +ifdef V +VECHO = : +else +VECHO = echo " " +.SILENT: +endif + +DEPFILES = $(LIB_TESTS:%=%.d) $(NOLIB_TESTS:%=%.d) $(HUGELINK_TESTS:%=%.d) \ + $(HELPERS:%=%.d) testutils.d + +ALLTESTS = $(foreach DIR,$(OBJDIRS),$(TESTS:%=$(DIR)/%)) +ALLHELPERS = $(foreach DIR,$(OBJDIRS),$(HELPERS:%=$(DIR)/%)) +ALLHELPERLIBS = $(foreach DIR,$(OBJDIRS),$(HELPER_LIBS:%=$(DIR)/%)) +ifdef CC64 +ALLTESTS += $(TESTS_64:%=obj64/%) +endif + +# For now, build only one test as a static binary. +# Can be changed once libhugetlbfs has better support for static linking. +# Also, some tests should be changed to use syscall() instead of +# dlsym() / rtld_next(). 
+ifdef CC32 +#ALLTESTS += $(LIB_TESTS:%=obj32/%_static) $(STRESS_TESTS:%=obj32/%_static) +#ALLTESTS += obj32/shmoverride_linked_static +endif +ifdef CC64 +#ALLTESTS += $(LIB_TESTS:%=obj64/%_static) $(STRESS_TESTS:%=obj64/%_static) +#ALLTESTS += obj64/shmoverride_linked_static +endif + +objs_needing_wrappers = \ + $(foreach W,$(WRAPPERS:%.sh=%),$(filter $(1)/$(W),$(ALLTESTS))) +WRAPPERS32 = $(addsuffix .sh,$(call objs_needing_wrappers,obj32)) +WRAPPERS64 = $(addsuffix .sh,$(call objs_needing_wrappers,obj64)) +ALLWRAPPERS = $(WRAPPERS32) $(WRAPPERS64) + +all: $(ALLTESTS) $(ALLHELPERS) $(ALLHELPERLIBS) $(ALLWRAPPERS) + +shmoverride_linked.c: shmoverride_unlinked.c + ln -s shmoverride_unlinked.c shmoverride_linked.c + +obj32/%.o: %.c + @$(VECHO) CC32 $@ + @mkdir -p obj32 + $(CC32) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +obj64/%.o: %.c + @$(VECHO) CC64 $@ + @mkdir -p obj64 + $(CC64) $(CPPFLAGS) $(CFLAGS) -o $@ -c $< + +obj32/%-pic.o: %.c + @$(VECHO) CC32 $@ + @mkdir -p obj32 + $(CC32) $(CPPFLAGS) $(CFLAGS) -fPIC -o $@ -c $< + +obj64/%-pic.o: %.c + @$(VECHO) CC64 $@ + @mkdir -p obj64 + $(CC64) $(CPPFLAGS) $(CFLAGS) -fPIC -o $@ -c $< + +obj32/libheapshrink.so: obj32/heapshrink-helper-pic.o + @$(VECHO) LD32 "(shared)" $@ + @mkdir -p obj32 + $(CC32) -Wl,-soname,$(notdir $@) -shared -o $@ $^ + +obj64/libheapshrink.so: obj64/heapshrink-helper-pic.o + @$(VECHO) LD64 "(shared)" $@ + @mkdir -p obj64 + $(CC64) -Wl,-soname,$(notdir $@) -shared -o $@ $^ + +$(LIB_TESTS:%=obj32/%): %: %.o obj32/testutils.o obj32/libtestutils.o + @$(VECHO) LD32 "(lib test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(LIB_TESTS:%=obj64/%) $(LIB_TESTS_64:%=obj64/%): %: %.o obj64/testutils.o obj64/libtestutils.o + @$(VECHO) LD64 "(lib test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(LIB_TESTS:%=obj32/%_static): %_static: %.o obj32/testutils.o obj32/libtestutils.o + @$(VECHO) LD32 "(lib test)" $@ + $(CC32) -static $(LDFLAGS) $(LDFLAGS32) -o $@ $^ 
$(STATIC_LDLIBS) $(STATIC_LIBHUGE) + +$(LIB_TESTS:%=obj64/%_static) $(LIB_TESTS_64:%=obj64/%_static): %_static: %.o obj64/testutils.o obj64/libtestutils.o + @$(VECHO) LD64 "(lib test)" $@ + $(CC64) -static $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(STATIC_LDLIBS) $(STATIC_LIBHUGE) + +$(NOLIB_TESTS:%=obj32/%): %: %.o obj32/testutils.o + @$(VECHO) LD32 "(nolib test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) + +$(NOLIB_TESTS:%=obj64/%): %: %.o obj64/testutils.o + @$(VECHO) LD64 "(nolib test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) + +obj32/%.ldscript: obj32/%.o obj32/testutils.o + @$(VECHO) SCRIPT32 $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -Wl,--verbose -o/dev/null $^ $(LDLIBS) > $@ + +obj64/%.ldscript: obj64/%.o obj64/testutils.o + @$(VECHO) SCRIPT64 $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -Wl,--verbose -o/dev/null $^ $(LDLIBS) > $@ + +$(LDSCRIPT_TESTS:%=obj32/%): obj32/%: %.ld obj32/%.o obj32/testutils.o + @$(VECHO) LD32 "(preload test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ -Lobj32 $^ $(LDLIBS) || cp $(BADTOOLCHAIN) $@ + +$(LDSCRIPT_TESTS:%=obj64/%): obj64/%: %.ld obj64/%.o obj64/testutils.o + @$(VECHO) LD64 "(preload test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ -Lobj64 $^ $(LDLIBS) || cp $(BADTOOLCHAIN) $@ + +$(HUGELINK_TESTS:%=obj32/%): %: %.o obj32/testutils.o + @$(VECHO) LD32 "(hugelink test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) + +$(HUGELINK_TESTS:%=obj64/%): %: %.o obj64/testutils.o + @$(VECHO) LD64 "(hugelink test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) + +$(HUGELINK_RW_TESTS:%=obj32/%): %: %.o $(HUGETLBFS_LD) obj32/testutils.o + @$(VECHO) LD32 "(hugelink_rw test)" $@ + @mkdir -p obj32 + @ln -sf ../$(HUGETLBFS_LD) obj32/ld + $(CC32) -B./obj32 $(LDFLAGS) $(LDFLAGS32) -o $@ $(LDLIBS) -Wl,--hugetlbfs-align $(filter %.o,$^) + +$(HUGELINK_RW_TESTS:%=obj64/%): %: %.o $(HUGETLBFS_LD) obj64/testutils.o + @$(VECHO) LD64 "(hugelink_rw test)" $@ + @mkdir -p obj64 + @ln -sf ../$(HUGETLBFS_LD) obj64/ld + 
$(CC64) -B./obj64 $(LDFLAGS) $(LDFLAGS64) -o $@ $(LDLIBS) -Wl,--hugetlbfs-align $(filter %.o,$^) + +$(STRESS_TESTS:%=obj32/%): %: %.o obj32/testutils.o + @$(VECHO) LD32 "(lib test)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(STRESS_TESTS:%=obj64/%): %: %.o obj64/testutils.o + @$(VECHO) LD64 "(lib test)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(STRESS_TESTS:%=obj32/%_static): %_static: %.o obj32/testutils.o + @$(VECHO) LD32 "(lib test)" $@ + $(CC32) -static $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(STATIC_LDLIBS) $(STATIC_LIBHUGE) + +$(STRESS_TESTS:%=obj64/%_static): %_static: %.o obj64/testutils.o + @$(VECHO) LD64 "(lib test)" $@ + $(CC64) -static $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(STATIC_LDLIBS) $(STATIC_LIBHUGE) + +obj32/xB.%: $(SCRIPTS32).xB $(HUGETLBFS_LD) obj32/%.o obj32/testutils.o + @$(VECHO) LD32 "(xB test)" $@ + @mkdir -p obj32 + @ln -sf ../$(HUGETLBFS_LD) obj32/ld + HUGETLB_DEPRECATED_LINK=1 $(CC32) -B./obj32 $(LDFLAGS) $(LDFLAGS32) -o $@ $(LDLIBS) -Wl,--hugetlbfs-link=B $(filter %.o,$^) + +obj64/xB.%: $(SCRIPTS64).xB $(HUGETLBFS_LD) obj64/%.o obj64/testutils.o + @$(VECHO) LD64 "(xB test)" $@ + @mkdir -p obj64 + @ln -sf ../$(HUGETLBFS_LD) obj64/ld + HUGETLB_DEPRECATED_LINK=1 $(CC64) -B./obj64 $(LDFLAGS) $(LDFLAGS64) -o $@ $(LDLIBS) -Wl,--hugetlbfs-link=B $(filter %.o,$^) + +obj32/xBDT.%: $(SCRIPTS32).xBDT $(HUGETLBFS_LD) obj32/%.o obj32/testutils.o + @$(VECHO) LD32 "(xBDT test)" $@ + @mkdir -p obj32 + @ln -sf ../$(HUGETLBFS_LD) obj32/ld + HUGETLB_DEPRECATED_LINK=1 $(CC32) -B./obj32 $(LDFLAGS) $(LDFLAGS32) -o $@ $(LDLIBS) -Wl,--hugetlbfs-link=BDT $(filter %.o,$^) + +obj64/xBDT.%: $(SCRIPTS64).xBDT $(HUGETLBFS_LD) obj64/%.o obj64/testutils.o + @$(VECHO) LD64 "(xBDT test)" $@ + @mkdir -p obj64 + @ln -sf ../$(HUGETLBFS_LD) obj64/ld + HUGETLB_DEPRECATED_LINK=1 $(CC64) -B./obj64 $(LDFLAGS) $(LDFLAGS64) -o $@ $(LDLIBS) -Wl,--hugetlbfs-link=BDT $(filter %.o,$^) + +$(HELPERS:%=obj32/%): %: %.o + @$(VECHO) LD32 
"(helper)" $@ + $(CC32) $(LDFLAGS) $(LDFLAGS32) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(HELPERS:%=obj64/%): %: %.o + @$(VECHO) LD64 "(helper)" $@ + $(CC64) $(LDFLAGS) $(LDFLAGS64) -o $@ $^ $(LDLIBS) -lhugetlbfs + +$(WRAPPERS32): obj32/%.sh: %.sh obj32/% + @$(VECHO) COPY "(wrapped test)" $@ + @cp -f $< $@ + +$(WRAPPERS64): obj64/%.sh: %.sh obj64/% + @$(VECHO) COPY "(wrapped test)" $@ + @cp -f $< $@ + +clean: + @$(VECHO) CLEAN "(tests)" + rm -f *~ *.o *.so *.a *.d core a.out + rm -rf obj* + rm -f shmoverride_linked.c # Autogenerated file + rm -f $(TESTS) + +%.d: %.c + @$(CC) $(CPPFLAGS) -MM -MT "$(foreach DIR,$(OBJDIRS),$(DIR)/$*.o) $@" $< > $@ + +-include $(DEPFILES) + +obj32/install: + @$(VECHO) INSTALL32 $(INST_TESTSDIR32) + $(INSTALL) -d $(DESTDIR)$(INST_TESTSDIR32) + $(INSTALL) -d $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 $(TESTS:%=obj32/%) $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 $(WRAPPERS32) $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 wrapper-utils.sh $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 $(HELPERS:%=obj32/%) $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 $(HELPER_LIBS:%=obj32/%) $(DESTDIR)$(INST_TESTSDIR32)/obj32 + $(INSTALL) -m 755 run_tests.py $(DESTDIR)$(INST_TESTSDIR32) + +obj64/install: + @$(VECHO) INSTALL64 $(INST_TESTSDIR64) + $(INSTALL) -d $(DESTDIR)$(INST_TESTSDIR64) + $(INSTALL) -d $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(TESTS:%=obj64/%) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(WRAPPERS64) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 wrapper-utils.sh $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(HELPERS:%=obj64/%) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(HELPER_LIBS:%=obj64/%) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 $(TESTS_64:%=obj64/%) $(DESTDIR)$(INST_TESTSDIR64)/obj64 + $(INSTALL) -m 755 run_tests.py $(DESTDIR)$(INST_TESTSDIR64) + +install: $(OBJDIRS:%=%/install) diff --git 
a/default/libhugetlbfs/libhugetlbfs/tests/alloc-instantiate-race.c b/default/libhugetlbfs/libhugetlbfs/tests/alloc-instantiate-race.c new file mode 100644 index 0000000..0929924 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/alloc-instantiate-race.c @@ -0,0 +1,274 @@ +/* + * Test rationale: + * + * This test is designed to detect a kernel allocation race introduced + * with hugepage demand-faulting. The problem is that no lock is held + * between allocating a hugepage and instantiating it in the + * pagetables or page cache index. In between the two, the (huge) + * page is cleared, so there's substantial time. Thus two processes + * can race instantiating the (same) last available hugepage - one + * will fail on the allocation, and thus cause an OOM fault even + * though the page it actually wants is being instantiated by the + * other racing process. + * + * + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sched.h> +#include <signal.h> +#include <sys/wait.h> +#include <pthread.h> +#include <linux/unistd.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +pid_t gettid(void) +{ + return syscall(__NR_gettid); +} + +static long hpage_size; +static pid_t child1, child2; +static pthread_t thread1, thread2; + +void cleanup(void) +{ + if (child1) + kill(child1, SIGKILL); + if (child2) + kill(child2, SIGKILL); +} + +static int one_racer(void *p, int cpu, + volatile int *mytrigger, volatile int *othertrigger) +{ + volatile int *pi = p; + cpu_set_t cpuset; + int err; + + /* Split onto different cpus to encourage the race */ + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + + err = sched_setaffinity(gettid(), CPU_SETSIZE/8, &cpuset); + if (err != 0) + CONFIG("sched_setaffinity(cpu%d): %s", cpu, strerror(errno)); + + /* Ready.. */ + *mytrigger = 1; + /* Set.. */ + while (! *othertrigger) + ; + + /* Instantiate! 
*/ + *pi = 1; + + return 0; +} + +static void proc_racer(void *p, int cpu, + volatile int *mytrigger, volatile int *othertrigger) +{ + exit(one_racer(p, cpu, mytrigger, othertrigger)); +} + +struct racer_info { + void *p; /* instantiation address */ + int cpu; + int race_type; + volatile int *mytrigger; + volatile int *othertrigger; + int status; +}; + +static void *thread_racer(void *info) +{ + struct racer_info *ri = info; + int rc; + + rc = one_racer(ri->p, ri->cpu, ri->mytrigger, ri->othertrigger); + return ri; +} +static void run_race(void *syncarea, int race_type) +{ + volatile int *trigger1, *trigger2; + int fd; + void *p; + int status1, status2; + int ret; + + memset(syncarea, 0, sizeof(*trigger1) + sizeof(*trigger2)); + trigger1 = syncarea; + trigger2 = trigger1 + 1; + + /* Get a new file for the final page */ + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + verbose_printf("Mapping final page.. "); + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, race_type, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + verbose_printf("%p\n", p); + + if (race_type == MAP_SHARED) { + child1 = fork(); + if (child1 < 0) + FAIL("fork(): %s", strerror(errno)); + if (child1 == 0) + proc_racer(p, 0, trigger1, trigger2); + + child2 = fork(); + if (child2 < 0) + FAIL("fork(): %s", strerror(errno)); + if (child2 == 0) + proc_racer(p, 1, trigger2, trigger1); + + /* wait() calls */ + ret = waitpid(child1, &status1, 0); + if (ret < 0) + FAIL("waitpid() child 1: %s", strerror(errno)); + verbose_printf("Child 1 status: %x\n", status1); + + + ret = waitpid(child2, &status2, 0); + if (ret < 0) + FAIL("waitpid() child 2: %s", strerror(errno)); + verbose_printf("Child 2 status: %x\n", status2); + + if (WIFSIGNALED(status1)) + FAIL("Child 1 killed by signal %s", + strsignal(WTERMSIG(status1))); + if (WIFSIGNALED(status2)) + FAIL("Child 2 killed by signal %s", + strsignal(WTERMSIG(status2))); + + status1 = WEXITSTATUS(status1); + 
status2 = WEXITSTATUS(status2); + } else { + struct racer_info ri1 = { + .p = p, + .cpu = 0, + .mytrigger = trigger1, + .othertrigger = trigger2, + }; + struct racer_info ri2 = { + .p = p, + .cpu = 1, + .mytrigger = trigger2, + .othertrigger = trigger1, + }; + void *tret1, *tret2; + + ret = pthread_create(&thread1, NULL, thread_racer, &ri1); + if (ret != 0) + FAIL("pthread_create() 1: %s\n", strerror(errno)); + + ret = pthread_create(&thread2, NULL, thread_racer, &ri2); + if (ret != 0) + FAIL("pthread_create() 2: %s\n", strerror(errno)); + + ret = pthread_join(thread1, &tret1); + if (ret != 0) + FAIL("pthread_join() 1: %s\n", strerror(errno)); + if (tret1 != &ri1) + FAIL("Thread 1 returned %p not %p, killed?\n", + tret1, &ri1); + ret = pthread_join(thread2, &tret2); + if (ret != 0) + FAIL("pthread_join() 2: %s\n", strerror(errno)); + if (tret2 != &ri2) + FAIL("Thread 2 returned %p not %p, killed?\n", + tret2, &ri2); + + status1 = ri1.status; + status2 = ri2.status; + } + + if (status1 != 0) + FAIL("Racer 1 terminated with code %d", status1); + + if (status2 != 0) + FAIL("Racer 2 terminated with code %d", status2); +} + +int main(int argc, char *argv[]) +{ + unsigned long totpages; + int fd; + void *p, *q; + unsigned long i; + int race_type; + + test_init(argc, argv); + + if (argc != 2) + CONFIG("Usage: alloc-instantiate-race <private|shared>"); + + hpage_size = check_hugepagesize(); + totpages = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + + if (strcmp(argv[1], "shared") == 0) { + race_type = MAP_SHARED; + } else if (strcmp(argv[1], "private") == 0) { + race_type = MAP_PRIVATE; + } else { + CONFIG("Usage: alloc-instantiate-race <private|shared>"); + } + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + /* Get a shared normal page for synchronization */ + verbose_printf("Mapping synchronization area.."); + q = mmap(NULL, getpagesize(), PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS, -1, 0); + if (q == MAP_FAILED) + 
FAIL("mmap() sync area: %s", strerror(errno)); + verbose_printf("done\n"); + + verbose_printf("Mapping %ld/%ld pages.. ", totpages-1, totpages); + p = mmap(NULL, (totpages-1)*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + /* Allocate all save one of the pages up front */ + verbose_printf("instantiating.. "); + for (i = 0; i < (totpages - 1); i++) + memset(p + (i * hpage_size), 0, sizeof(int)); + verbose_printf("done\n"); + + run_race(q, race_type); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/bad-toolchain.sh b/default/libhugetlbfs/libhugetlbfs/tests/bad-toolchain.sh new file mode 100755 index 0000000..2535aa0 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/bad-toolchain.sh @@ -0,0 +1,5 @@ +#! /bin/sh + +echo "Bad toolchain: can't build this testcase" + +exit 1 diff --git a/default/libhugetlbfs/libhugetlbfs/tests/brk_near_huge.c b/default/libhugetlbfs/libhugetlbfs/tests/brk_near_huge.c new file mode 100644 index 0000000..71eb803 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/brk_near_huge.c @@ -0,0 +1,114 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * Certain kernels have a bug where brk() does not perform the same + * checks that a MAP_FIXED mmap() will, allowing brk() to create a + * normal page VMA in a hugepage only address region. This can lead + * to oopses or other badness. + */ + +/* Possibly these functions should go in the library itself.. */ +#ifdef __powerpc64__ +void *next_chunk(void *addr) +{ + if ((unsigned long)addr < 0x100000000UL) + /* 256M segments below 4G */ + return PALIGN(addr, 0x10000000UL); + else + /* 1TB segments above */ + return PALIGN(addr, 0x10000000000UL); +} +#elif defined(__powerpc__) +void *next_chunk(void *addr) +{ + return PALIGN(addr, 0x10000000UL); +} +#elif defined(__ia64__) +void *next_chunk(void *addr) +{ + return PALIGN(addr, 0x8000000000000000UL); +} +#else +void *next_chunk(void *addr) +{ + return PALIGN(addr, gethugepagesize()); +} +#endif + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *brk0, *hugemap_addr, *newbrk; + char *p; + int err; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + brk0 = sbrk(0); + verbose_printf("Initial break at %p\n", brk0); + + hugemap_addr = next_chunk(brk0) + hpage_size; + + p = mmap(hugemap_addr, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + if (p != hugemap_addr) + FAIL("mmap() at unexpected address %p instead of %p\n", p, + hugemap_addr); + + verbose_printf("Hugepage mapped at %p-%p\n", p, 
p+hpage_size-1); + + err = test_addr_huge((void *)p); + if (err != 1) + FAIL("Mapped address is not hugepage"); + + newbrk = next_chunk(brk0) + getpagesize(); + err = brk((void *)newbrk); + if (err == -1) + /* Failing the brk() is an acceptable kernel response */ + PASS(); + + /* Succeeding the brk() is acceptable iff the new memory is + * properly accessible and we don't have a kernel blow up when + * we touch it. */ + memset(brk0, 0, newbrk-brk0); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/chunk-overcommit.c b/default/libhugetlbfs/libhugetlbfs/tests/chunk-overcommit.c new file mode 100644 index 0000000..e8f20e0 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/chunk-overcommit.c @@ -0,0 +1,114 @@ +/* + * Test rationale: + * + * Some kernel versions after hugepage demand allocation was added + * used a dubious heuristic to check if there was enough hugepage + * space available for a given mapping. The number of + * not-already-instantiated pages in the mapping was compared against + * the total hugepage free pool. It was very easy to confuse this + * heuristic into overcommitting by allocating hugepage memory in + * chunks, each less than the total available pool size but together + * more than available. This would generally lead to OOM SIGKILLs of + * one process or another when it tried to instantiate pages beyond + * the available pool. + * + * + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <signal.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +int main(int argc, char *argv[]) +{ + long hpage_size; + unsigned long totpages, chunk1, chunk2; + int fd; + void *p, *q; + pid_t child, ret; + int status; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + totpages = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + chunk1 = (totpages / 2) + 1; + chunk2 = totpages - chunk1 + 1; + + verbose_printf("overcommit: %ld hugepages available: " + "chunk1=%ld chunk2=%ld\n", totpages, chunk1, chunk2); + + p = mmap(NULL, chunk1*hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() chunk1: %s", strerror(errno)); + + q = mmap(NULL, chunk2*hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, chunk1*hpage_size); + if (q == MAP_FAILED) { + if (errno != ENOMEM) + FAIL("mmap() chunk2: %s", strerror(errno)); + else + PASS(); + } + + verbose_printf("Looks like we've overcommitted, testing...\n"); + + /* Looks like we're overcommitted, but we need to confirm that + * this is bad. We touch it all in a child process because an + * overcommit will generally lead to a SIGKILL which we can't + * handle, of course. 
*/ + child = fork(); + if (child < 0) + FAIL("fork(): %s", strerror(errno)); + + if (child == 0) { + memset(p, 0, chunk1*hpage_size); + memset(q, 0, chunk2*hpage_size); + exit(0); + } + + ret = waitpid(child, &status, 0); + if (ret < 0) + FAIL("waitpid(): %s", strerror(errno)); + + if (WIFSIGNALED(status)) + FAIL("Killed by signal \"%s\" due to overcommit", + strsignal(WTERMSIG(status))); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/compare_kvers.c b/default/libhugetlbfs/libhugetlbfs/tests/compare_kvers.c new file mode 100644 index 0000000..e2ef62a --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/compare_kvers.c @@ -0,0 +1,41 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <errno.h> +#include "libhugetlbfs_privutils.h" + +int main (int argc, char **argv) +{ + if (argc != 3) { + printf("Usage: %s <version-str> <version-str>\n", argv[0]); + return -1; + } + + switch (test_compare_kver(argv[1], argv[2])) { + case 0: /* Equal to */ + return 0; + case -1: /* Less than */ + return 1; + case 1: /* Greater than */ + return 2; + default: + return -1; + } +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/counters.c b/default/libhugetlbfs/libhugetlbfs/tests/counters.c new file mode 100644 index 0000000..0284809 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/counters.c @@ -0,0 +1,414 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2007 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +/* + * Test Rationale: + * + * The hugetlb pool maintains 4 global counters to track pages as they + * transition between various states. Due to the complex relationships between + * the counters, regressions are likely to occur in the future. This test + * performs operations that change the counters in known ways. It emulates the + * expected kernel behavior and compares the expected result to the actual + * values after each operation. + */ + +extern int errno; + +/* Global test configuration */ +#define DYNAMIC_SYSCTL "/proc/sys/vm/nr_overcommit_hugepages" +static long saved_nr_hugepages = -1; +static long saved_oc_hugepages = -1; +static long hpage_size; +static int private_resv; + +/* State arrays for our mmaps */ +#define NR_SLOTS 2 +#define SL_SETUP 0 +#define SL_TEST 1 +static int map_fd[NR_SLOTS]; +static char *map_addr[NR_SLOTS]; +static unsigned long map_size[NR_SLOTS]; +static unsigned int touched[NR_SLOTS]; + +/* Keep track of expected counter values */ +static long prev_total; +static long prev_free; +static long prev_resv; +static long prev_surp; + +#define min(a,b) (((a) < (b)) ? (a) : (b)) +#define max(a,b) (((a) > (b)) ? 
(a) : (b)) + +/* Restore original nr_hugepages */ +void cleanup(void) { + if (hpage_size <= 0) + return; + if (saved_nr_hugepages >= 0) + set_nr_hugepages(hpage_size, saved_nr_hugepages); + if (saved_oc_hugepages >= 0) + set_nr_overcommit_hugepages(hpage_size, saved_oc_hugepages); +} + +void verify_dynamic_pool_support(void) +{ + saved_oc_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_OC); + if (saved_oc_hugepages < 0) + FAIL("Kernel appears to lack dynamic hugetlb pool support"); + set_nr_overcommit_hugepages(hpage_size, 10); +} + +void bad_value(int line, const char *name, long expect, long actual) +{ + if (actual == -1) + ERROR("%s not found in /proc/meminfo", name); + else + FAIL("Line %i: Bad %s: expected %li, actual %li", + line, name, expect, actual); +} + +void verify_counters(int line, long et, long ef, long er, long es) +{ + long t, f, r, s; + + t = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + f = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + r = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + s = get_huge_page_counter(hpage_size, HUGEPAGES_SURP); + + /* Invariant checks */ + if (t < 0 || f < 0 || r < 0 || s < 0) + ERROR("Negative counter value"); + if (f < r) + ERROR("HugePages_Free < HugePages_Rsvd"); + + /* Check actual values against expected values */ + if (t != et) + bad_value(line, "HugePages_Total", et, t); + + if (f != ef) + bad_value(line, "HugePages_Free", ef, f); + + if (r != er) + bad_value(line, "HugePages_Rsvd", er, r); + + if (s != es) + bad_value(line, "HugePages_Surp", es, s); + + /* Everything's good. 
Update counters */ + prev_total = t; + prev_free = f; + prev_resv = r; + prev_surp = s; +} + +/* Memory operations: + * Each of these has a predefined effect on the counters + */ +#define persistent_huge_pages (et - es) +void _set_nr_hugepages(unsigned long count, int line) +{ + long min_size; + long et, ef, er, es; + + if (set_nr_hugepages(hpage_size, count)) + FAIL("Cannot set nr_hugepages"); + + /* The code below is based on set_max_huge_pages in mm/hugetlb.c */ + es = prev_surp; + et = prev_total; + ef = prev_free; + er = prev_resv; + + /* + * Increase the pool size + * First take pages out of surplus state. Then make up the + * remaining difference by allocating fresh huge pages. + */ + while (es && count > persistent_huge_pages) + es--; + while (count > persistent_huge_pages) { + et++; + ef++; + } + if (count >= persistent_huge_pages) + goto out; + + /* + * Decrease the pool size + * First return free pages to the buddy allocator (being careful + * to keep enough around to satisfy reservations). Then place + * pages into surplus state as needed so the pool will shrink + * to the desired size as pages become free. 
+ */ + min_size = max(count, er + et - ef); + while (min_size < persistent_huge_pages) { + ef--; + et--; + } + while (count < persistent_huge_pages) { + es++; + } + +out: + verify_counters(line, et, ef, er, es); +} +#undef set_nr_hugepages +#define set_nr_hugepages(c) _set_nr_hugepages(c, __LINE__) + +void _map(int s, int hpages, int flags, int line) +{ + long et, ef, er, es; + + map_fd[s] = hugetlbfs_unlinked_fd(); + if (map_fd[s] < 0) + CONFIG("Unable to open hugetlbfs file: %s", strerror(errno)); + map_size[s] = hpages * hpage_size; + map_addr[s] = mmap(NULL, map_size[s], PROT_READ|PROT_WRITE, flags, + map_fd[s], 0); + if (map_addr[s] == MAP_FAILED) + FAIL("mmap failed: %s", strerror(errno)); + touched[s] = 0; + + et = prev_total; + ef = prev_free; + er = prev_resv; + es = prev_surp; + + /* + * When using MAP_SHARED, a reservation will be created to guarantee + * pages to the process. If not enough pages are available to + * satisfy the reservation, surplus pages are added to the pool. + * NOTE: This code assumes that the whole mapping needs to be + * reserved and hence, will not work with partial reservations. + * + * If the kernel supports private reservations, then MAP_PRIVATE + * mappings behave like MAP_SHARED at mmap time. Otherwise, + * no counter updates will occur. 
+ */ + if ((flags & MAP_SHARED) || private_resv) { + unsigned long shortfall = 0; + if (hpages + prev_resv > prev_free) + shortfall = hpages - prev_free + prev_resv; + et += shortfall; + ef = prev_free + shortfall; + er = prev_resv + hpages; + es = prev_surp + shortfall; + } + + verify_counters(line, et, ef, er, es); +} +#define map(s, h, f) _map(s, h, f, __LINE__) + +void _unmap(int s, int hpages, int flags, int line) +{ + long et, ef, er, es; + unsigned long i; + + munmap(map_addr[s], map_size[s]); + close(map_fd[s]); + map_fd[s] = -1; + map_addr[s] = NULL; + map_size[s] = 0; + + et = prev_total; + ef = prev_free; + er = prev_resv; + es = prev_surp; + + /* + * When a VMA is unmapped, the instantiated (touched) pages are + * freed. If the pool is in a surplus state, pages are freed to the + * buddy allocator, otherwise they go back into the hugetlb pool. + * NOTE: This code assumes touched pages have only one user. + */ + for (i = 0; i < touched[s]; i++) { + if (es) { + et--; + es--; + } else + ef++; + } + + /* + * mmap may have created some surplus pages to accommodate a + * reservation. If those pages were not touched, then they will + * not have been freed by the code above. Free them here. + */ + if ((flags & MAP_SHARED) || private_resv) { + int unused_surplus = min(hpages - touched[s], es); + et -= unused_surplus; + ef -= unused_surplus; + er -= hpages - touched[s]; + es -= unused_surplus; + } + + verify_counters(line, et, ef, er, es); +} +#define unmap(s, h, f) _unmap(s, h, f, __LINE__) + +void _touch(int s, int hpages, int flags, int line) +{ + long et, ef, er, es; + int nr; + char *c; + + for (c = map_addr[s], nr = hpages; + hpages && c < map_addr[s] + map_size[s]; + c += hpage_size, nr--) + *c = (char) (nr % 2); + /* + * Keep track of how many pages were touched since we can't easily + * detect that from user space. + * NOTE: Calling this function more than once for a mmap may yield + * results you don't expect. 
Be careful :) + */ + touched[s] = max(touched[s], hpages); + + /* + * Shared (and private when supported) mappings and consume resv pages + * that were previously allocated. Also deduct them from the free count. + * + * Unreserved private mappings may need to allocate surplus pages to + * satisfy the fault. The surplus pages become part of the pool + * which could elevate total, free, and surplus counts. resv is + * unchanged but free must be decreased. + */ + if (flags & MAP_SHARED || private_resv) { + et = prev_total; + ef = prev_free - hpages; + er = prev_resv - hpages; + es = prev_surp; + } else { + if (hpages + prev_resv > prev_free) + et = prev_total + (hpages - prev_free + prev_resv); + else + et = prev_total; + er = prev_resv; + es = prev_surp + et - prev_total; + ef = prev_free - hpages + et - prev_total; + } + verify_counters(line, et, ef, er, es); +} +#define touch(s, h, f) _touch(s, h, f, __LINE__) + +void run_test(char *desc, int base_nr) +{ + verbose_printf("%s...\n", desc); + set_nr_hugepages(base_nr); + + /* untouched, shared mmap */ + map(SL_TEST, 1, MAP_SHARED); + unmap(SL_TEST, 1, MAP_SHARED); + + /* untouched, private mmap */ + map(SL_TEST, 1, MAP_PRIVATE); + unmap(SL_TEST, 1, MAP_PRIVATE); + + /* touched, shared mmap */ + map(SL_TEST, 1, MAP_SHARED); + touch(SL_TEST, 1, MAP_SHARED); + unmap(SL_TEST, 1, MAP_SHARED); + + /* touched, private mmap */ + map(SL_TEST, 1, MAP_PRIVATE); + touch(SL_TEST, 1, MAP_PRIVATE); + unmap(SL_TEST, 1, MAP_PRIVATE); + + /* Explicit resizing during outstanding surplus */ + /* Consume surplus when growing pool */ + map(SL_TEST, 2, MAP_SHARED); + set_nr_hugepages(max(base_nr, 1)); + + /* Add pages once surplus is consumed */ + set_nr_hugepages(max(base_nr, 3)); + + /* Release free huge pages first */ + set_nr_hugepages(max(base_nr, 2)); + + /* When shrinking beyond committed level, increase surplus */ + set_nr_hugepages(base_nr); + + /* Upon releasing the reservation, reduce surplus counts */ + unmap(SL_TEST, 2, 
MAP_SHARED); + + verbose_printf("OK.\n"); +} + +int main(int argc, char ** argv) +{ + int base_nr; + + test_init(argc, argv); + hpage_size = check_hugepagesize(); + saved_nr_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + verify_dynamic_pool_support(); + check_must_be_root(); + + if ((private_resv = kernel_has_private_reservations()) == -1) + FAIL("kernel_has_private_reservations() failed\n"); + + /* + * This test case should require a maximum of 3 huge pages. + * Run through the battery of tests multiple times, with an increasing + * base pool size. This alters the circumstances under which surplus + * pages need to be allocated and increases the corner cases tested. + */ + for (base_nr = 0; base_nr <= 3; base_nr++) { + verbose_printf("Base pool size: %i\n", base_nr); + /* Run the tests with a clean slate */ + run_test("Clean", base_nr); + + /* Now with a pre-existing untouched, shared mmap */ + map(SL_SETUP, 1, MAP_SHARED); + run_test("Untouched, shared", base_nr); + unmap(SL_SETUP, 1, MAP_SHARED); + + /* Now with a pre-existing untouched, private mmap */ + map(SL_SETUP, 1, MAP_PRIVATE); + run_test("Untouched, private", base_nr); + unmap(SL_SETUP, 1, MAP_PRIVATE); + + /* Now with a pre-existing touched, shared mmap */ + map(SL_SETUP, 1, MAP_SHARED); + touch(SL_SETUP, 1, MAP_SHARED); + run_test("Touched, shared", base_nr); + unmap(SL_SETUP, 1, MAP_SHARED); + + /* Now with a pre-existing touched, private mmap */ + map(SL_SETUP, 1, MAP_PRIVATE); + touch(SL_SETUP, 1, MAP_PRIVATE); + run_test("Touched, private", base_nr); + unmap(SL_SETUP, 1, MAP_PRIVATE); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/counters.sh b/default/libhugetlbfs/libhugetlbfs/tests/counters.sh new file mode 100755 index 0000000..e3ffabe --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/counters.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +. 
wrapper-utils.sh + +# Huge page overcommit was not available until 2.6.24 +compare_kvers `uname -r` "2.6.24" +if [ $? -eq 1 ]; then + EXP_RC=$RC_FAIL +else + EXP_RC=$RC_PASS +fi + +exec_and_check $EXP_RC counters "$@" diff --git a/default/libhugetlbfs/libhugetlbfs/tests/direct.c b/default/libhugetlbfs/libhugetlbfs/tests/direct.c new file mode 100644 index 0000000..3418422 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/direct.c @@ -0,0 +1,101 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
#define P0 "ffffffff"
#define IOSZ 4096
char buf[IOSZ] __attribute__ ((aligned (IOSZ)));
#define TMPFILE "/tmp/direct"

/*
 * Test rationale:
 *
 * Data written from a huge-page mapping must survive a round trip
 * through an O_DIRECT file in both directions: direct write from the
 * huge page, then direct read back into it.
 */
int main(int argc, char *argv[])
{
	long hpage_size;
	int fd, dfd;
	void *p;
	/* write()/read() return ssize_t; -1 signals an error.  Using
	 * size_t here relied on implicit unsigned wraparound for the
	 * error check. */
	ssize_t ret;

	test_init(argc, argv);

	hpage_size = check_hugepagesize();

	fd = hugetlbfs_unlinked_fd();
	if (fd < 0)
		FAIL("hugetlbfs_unlinked_fd()");

	dfd = open(TMPFILE, O_CREAT|O_EXCL|O_DIRECT|O_RDWR, 0600);
	if (dfd < 0)
		CONFIG("Failed to open direct-IO file: %s", strerror(errno));
	/* Unlink immediately: the open fd keeps the file alive and no
	 * stale file is left behind however the test exits */
	unlink(TMPFILE);

	p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED)
		FAIL("mmap hugetlbfs file: %s", strerror(errno));

	memcpy(p, P0, 8);

	/* Direct write from huge page */
	ret = write(dfd, p, IOSZ);
	if (ret == -1)
		FAIL("Direct-IO write from huge page: %s", strerror(errno));
	if (ret != IOSZ)
		FAIL("Short direct-IO write from huge page");
	if (lseek(dfd, 0, SEEK_SET) == -1)
		FAIL("lseek: %s", strerror(errno));

	/* Check for accuracy */
	ret = read(dfd, buf, IOSZ);
	if (ret == -1)
		FAIL("Direct-IO read to normal memory: %s", strerror(errno));
	if (ret != IOSZ)
		FAIL("Short direct-IO read to normal memory");
	if (memcmp(P0, buf, 8))
		FAIL("Memory mismatch after Direct-IO write");
	if (lseek(dfd, 0, SEEK_SET) == -1)
		FAIL("lseek: %s", strerror(errno));

	/* Direct read to huge page */
	memset(p, 0, IOSZ);
	ret = read(dfd, p, IOSZ);
	if (ret == -1)
		FAIL("Direct-IO read to huge page: %s\n", strerror(errno));
	if (ret != IOSZ)
		FAIL("Short direct-IO read to huge page");

	/* Check for accuracy */
	if (memcmp(p, P0, 8))
		FAIL("Memory mismatch after Direct-IO read");

	close(dfd);
	/* No second unlink: the file was already unlinked right after
	 * open, so the old trailing unlink could only fail with ENOENT */

	PASS();
}
/* We override the normal open, so libhugetlbfs gets an apparently
 * empty /proc/mounts or /etc/mtab.  Every other path passes through
 * to the real libc open(). */
int open(const char *path, int flags, ...)
{
	/* Resolved once and cached; dlsym() need not run on every call */
	static int (*real_open)(const char *, int, ...);

	if ((strcmp(path, "/proc/mounts") == 0)
	    || (strcmp(path, "/etc/mtab") == 0))
		path = "/dev/null";

	if (!real_open)
		real_open = dlsym(RTLD_NEXT, "open");

	if (flags & O_CREAT) {
		va_list ap;
		mode_t mode;
		int fd;

		va_start(ap, flags);
		/* mode_t undergoes default argument promotion in
		 * varargs; fetch it as int and convert, which is
		 * well-defined even where mode_t is narrower than int */
		mode = (mode_t)va_arg(ap, int);
		va_end(ap);
		fd = (*real_open)(path, flags, mode);
		return fd;
	}
	return (*real_open)(path, flags);
}
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _XOPEN_SOURCE 600 +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> +#include "hugetests.h" + +/* + * Test rationale: + * + * fadvise() on some kernels can cause the reservation counter to get + * corrupted. The problem is that the patches are allocated for the + * reservation but not faulted in at the time of allocation. The + * counters do not get updated and effectively "leak". This test + * identifies whether the kernel is vunerable to the problem or not. + * It's fixed in kernel by commit f2deae9d4e70793568ef9e85d227abb7bef5b622. + */ +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long initial_rsvd, map_rsvd, fadvise_rsvd, end_rsvd; + + test_init(argc, argv); + + /* Setup */ + hpage_size = check_hugepagesize(); + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + initial_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count before map: %lu\n", initial_rsvd); + + /* mmap a region and record reservations */ + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + map_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after map: %lu\n", map_rsvd); + + /* fadvise the region and record reservations */ + if (posix_fadvise(fd, 0, hpage_size, POSIX_FADV_WILLNEED) == -1) + FAIL("fadvise(): %s", strerror(errno)); + fadvise_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after fadvise: %lu\n", fadvise_rsvd); + + /* Write the region */ + memset(p, 1, 
hpage_size); + + /* Free region */ + munmap(p, hpage_size); + close(fd); + end_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after close(): %lu\n", end_rsvd); + + /* Reserve count should match initial reserve count */ + if (end_rsvd != initial_rsvd) + FAIL("Reserve leaked: %lu != %lu\n", end_rsvd, initial_rsvd); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/fadvise_reserve.sh b/default/libhugetlbfs/libhugetlbfs/tests/fadvise_reserve.sh new file mode 100755 index 0000000..74496ec --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/fadvise_reserve.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# fadvise is known broken before 2.6.30 +compare_kvers `uname -r` "2.6.30" +if [ $? -eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC fadvise_reserve "$@" +fi + diff --git a/default/libhugetlbfs/libhugetlbfs/tests/find_path.c b/default/libhugetlbfs/libhugetlbfs/tests/find_path.c new file mode 100644 index 0000000..86019da --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/find_path.c @@ -0,0 +1,44 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + const char *dir; + + test_init(argc, argv); + + dir = hugetlbfs_find_path(); + + if (! dir) + CONFIG("No hugepage mount"); + + verbose_printf("Found hugetlbfs path at %s\n", dir); + + if (hugetlbfs_test_path(dir) == 1) + PASS(); + + FAIL(""); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/fork-cow.c b/default/libhugetlbfs/libhugetlbfs/tests/fork-cow.c new file mode 100644 index 0000000..70d3904 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/fork-cow.c @@ -0,0 +1,176 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 David Gibson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/wait.h> +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +/* + * Test rationale: + * + * This checks copy-on-write semantics, specifically the semantics of + * a MAP_PRIVATE mapping across a fork(). Some versions of the + * powerpc kernel had a bug in huge_ptep_set_wrprotect() which would + * fail to flush the hash table after setting the write protect bit in + * the parent's page tables, thus allowing the parent to pollute the + * child's mapping. + */ + +#define RANDOM_CONSTANT 0x1234ABCD +#define OTHER_CONSTANT 0xfeef5678 + +/* + * The parent uses this to check if the child terminated badly. 
+ */ +static void sigchld_handler(int signum, siginfo_t *si, void *uc) +{ + if (WEXITSTATUS(si->si_status) != 0) + FAIL("Child failed: %d", WEXITSTATUS(si->si_status)); + if (WIFSIGNALED(si->si_status)) + FAIL("Child recived signal %s", + strsignal(WTERMSIG(si->si_status))); +} + +int main(int argc, char ** argv) +{ + int fd, ret, status; + void *syncarea; + volatile unsigned int *p; + volatile unsigned int *trigger, *child_readback; + unsigned int parent_readback; + long hpage_size; + pid_t pid; + struct sigaction sa = { + .sa_sigaction = sigchld_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + check_free_huge_pages(2); + + if (argc != 1) + CONFIG("Usage: fork-cow\n"); + + /* Get a shared normal page for synchronization */ + verbose_printf("Mapping synchronization area.."); + syncarea = mmap(NULL, getpagesize(), PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS, -1, 0); + if (syncarea == MAP_FAILED) + FAIL("mmap() sync area: %s", strerror(errno)); + verbose_printf("done\n"); + + trigger = syncarea; + *trigger = 0; + + child_readback = trigger + 1; + *child_readback = 0; + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + CONFIG("hugetlbfs_unlinked_fd() failed\n"); + + verbose_printf("Mapping hugepage area..."); + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + verbose_printf("mapped at %p\n", p); + + /* Touch the page for write in parent */ + verbose_printf("Parent writes pre-fork..."); + *p = RANDOM_CONSTANT; + verbose_printf("%x\n", RANDOM_CONSTANT); + + ret = sigaction(SIGCHLD, &sa, NULL); + if (ret) + FAIL("sigaction(): %s", strerror(errno)); + + if ((pid = fork()) < 0) + FAIL("fork(): %s", strerror(errno)); + + if (pid != 0) { + /* Parent */ + verbose_printf("Parent writes post-fork..."); + *p = ~RANDOM_CONSTANT; + verbose_printf("%x\n", ~RANDOM_CONSTANT); + + *trigger = 1; + + while (*trigger != 2) + ; + + 
verbose_printf("Parent reads.."); + parent_readback = *p; + verbose_printf("%x\n", parent_readback); + + *trigger = 3; + } else { + /* Child */ + verbose_printf("Child starts..\n"); + + while (*trigger != 1) + ; + + verbose_printf("Child reads..."); + *child_readback = *p; + verbose_printf("%x\n", *child_readback); + + verbose_printf("Child writes..."); + *p = OTHER_CONSTANT; + verbose_printf("%x\n", OTHER_CONSTANT); + + *trigger = 2; + + while (*trigger != 3) + ; + + verbose_printf("Child exits...\n"); + exit(0); + } + + verbose_printf("child_readback = 0x%x, parent_readback = 0x%x\n", + *child_readback, parent_readback); + + if (*child_readback != RANDOM_CONSTANT) + FAIL("Child read back 0x%x instead of 0x%x", + *child_readback, RANDOM_CONSTANT); + if (parent_readback != ~RANDOM_CONSTANT) + FAIL("Parent read back 0x%x instead of 0x%x", + parent_readback, RANDOM_CONSTANT); + + ret = waitpid(pid, &status, 0); + if (ret < 0) + FAIL("waitpid(): %s", strerror(errno)); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/get_huge_pages.c b/default/libhugetlbfs/libhugetlbfs/tests/get_huge_pages.c new file mode 100644 index 0000000..2dc4e3d --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/get_huge_pages.c @@ -0,0 +1,76 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +long hpage_size; +long oc_hugepages = -1; + +/* Restore nr_overcommit_hugepages */ +void cleanup(void) +{ + if (oc_hugepages != -1) + set_nr_overcommit_hugepages(hpage_size, oc_hugepages); +} + +/* Confirm a region really frees, only really important for GHP_FALLBACK */ +void free_and_confirm_region_free(void *p, int line) { + unsigned char vec = 0; + free_huge_pages(p); + if (mincore(p, 4, &vec) == 0 || vec) + FAIL("free_huge_pages did not free region at line %d", line); +} + +void test_get_huge_pages(int num_hugepages) +{ + unsigned long long mapping_size; + void *p = get_huge_pages(num_hugepages * hpage_size, GHP_DEFAULT); + if (p == NULL) + FAIL("get_huge_pages() for %d hugepages", num_hugepages); + + memset(p, 1, hpage_size); + + mapping_size = get_mapping_page_size( + (void *)p + (num_hugepages -1) * hpage_size); + if (mapping_size != hpage_size) + FAIL("Returned page is not hugepage"); + + free_and_confirm_region_free(p, __LINE__); + mapping_size = get_mapping_page_size( + (void *)p + (num_hugepages -1) * hpage_size); + if (mapping_size) + FAIL("hugepage was not correctly freed"); +} + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + hpage_size = gethugepagesize(); + check_free_huge_pages(4); + test_get_huge_pages(1); + test_get_huge_pages(4); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/get_hugepage_region.c b/default/libhugetlbfs/libhugetlbfs/tests/get_hugepage_region.c new file mode 100644 index 0000000..292d201 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/get_hugepage_region.c @@ -0,0 +1,137 @@ +/* + * libhugetlbfs - Easy use of Linux 
hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +long hpage_size; +long oc_hugepages = -1; + +/* Restore nr_overcommit_hugepages */ +void cleanup(void) +{ + if (oc_hugepages != -1) + set_nr_overcommit_hugepages(hpage_size, oc_hugepages); +} + +/* Confirm a region really frees, only really important for GHR_FALLBACK */ +void free_and_confirm_region_free(void *p, int line) { + unsigned char vec = 0; + free_hugepage_region(p); + if (mincore(p, 4, &vec) == 0 || vec) + FAIL("free_hugepage_region did not free region at line %d", line); +} + +int test_unaligned_addr_huge(void *p) +{ + unsigned long long mapping_size; + p = (void *)((unsigned long)p & ~((gethugepagesize()) - 1)); + mapping_size = get_mapping_page_size(p); + return (mapping_size == hpage_size); +} + +#define TESTLEN ((num_hugepages - 1) * hpage_size + hpage_size / 2) + +void test_GHR_STRICT(int num_hugepages) +{ + int err; + void *p = get_hugepage_region(TESTLEN, GHR_DEFAULT); + if (p == NULL) + FAIL("get_hugepage_region() for %d hugepages", num_hugepages); + + memset(p, 1, TESTLEN); + + err = 
test_unaligned_addr_huge(p + (num_hugepages - 1) * hpage_size); + if (err != 1) + FAIL("Returned page is not hugepage"); + + free_and_confirm_region_free(p, __LINE__); + err = test_unaligned_addr_huge(p); + if (err == 1) + FAIL("hugepage was not correctly freed"); +} + +void test_GHR_FALLBACK(void) +{ + int err; + long rsvd_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + long num_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL) + - rsvd_hugepages; + + /* We must disable overcommitted huge pages to test this */ + oc_hugepages = get_huge_page_counter(hpage_size, HUGEPAGES_OC); + set_nr_overcommit_hugepages(hpage_size, 0); + + /* We should be able to allocate the whole pool */ + void *p = get_hugepage_region(TESTLEN, GHR_DEFAULT); + if (p == NULL) + FAIL("test_GHR_FALLBACK(GHR_DEFAULT) failed for %ld hugepages", + num_hugepages); + memset(p, 1, TESTLEN); + err = test_unaligned_addr_huge(p + (num_hugepages - 1) * hpage_size); + if (err != 1) + FAIL("Returned page is not hugepage"); + free_and_confirm_region_free(p, __LINE__); + + /* We should fail allocating too much */ + num_hugepages++; + p = get_hugepage_region(TESTLEN, GHR_STRICT); + if (p != NULL) + FAIL("test_GHR_FALLBACK() for %ld expected fail, got success", num_hugepages); + + /* GHR_FALLBACK should succeed by allocating base pages */ + p = get_hugepage_region(TESTLEN, GHR_FALLBACK); + if (p == NULL) + FAIL("test_GHR_FALLBACK(GHR_FALLBACK) failed for %ld hugepages", + num_hugepages); + memset(p, 1, TESTLEN); + err = test_unaligned_addr_huge(p + (num_hugepages - 1) * hpage_size); + if (err == 1) + FAIL("Returned page is not a base page"); + + /* + * We allocate a second fallback region to see can they be told apart + * on free. 
Merging VMAs would cause problems + */ + void *pb = get_hugepage_region(TESTLEN, GHR_FALLBACK); + if (pb == NULL) + FAIL("test_GHR_FALLBACK(GHR_FALLBACK) x2 failed for %ld hugepages", + num_hugepages); + memset(pb, 1, TESTLEN); + + free_and_confirm_region_free(pb, __LINE__); + free_and_confirm_region_free(p, __LINE__); +} + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + hpage_size = gethugepagesize(); + check_free_huge_pages(4); + test_GHR_STRICT(1); + test_GHR_STRICT(4); + test_GHR_FALLBACK(); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/get_hugetlbfs_path.c b/default/libhugetlbfs/libhugetlbfs/tests/get_hugetlbfs_path.c new file mode 100644 index 0000000..a3de22a --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/get_hugetlbfs_path.c @@ -0,0 +1,40 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * Copyright (C) 2006 Nishanth Aravamudan, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + const char *dir; + + dir = hugetlbfs_find_path(); + + if (!dir) + return -1; + + printf("%s\n", dir); + + return 0; +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/gethugepagesize.c b/default/libhugetlbfs/libhugetlbfs/tests/gethugepagesize.c new file mode 100644 index 0000000..7668b04 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/gethugepagesize.c @@ -0,0 +1,44 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + long hpage_size; + + test_init(argc, argv); + + hpage_size = gethugepagesize(); + + if (hpage_size > 0) { + verbose_printf("Huge page size is %ld bytes\n", hpage_size); + PASS(); + } + + if (hpage_size < 0) + CONFIG("No hugepage kernel support"); + + FAIL(""); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/gethugepagesizes.c b/default/libhugetlbfs/libhugetlbfs/tests/gethugepagesizes.c new file mode 100644 index 0000000..860b9a5 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/gethugepagesizes.c @@ -0,0 +1,412 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
int faked_data = 0;
char fake_sysfs[] = "/tmp/sysfs-XXXXXX";
char fake_meminfo[] = "/tmp/meminfo-XXXXXX";

#define REAL_SYSFS_DIR "/sys/kernel/mm/hugepages/"
DIR *(*real_opendir)(const char *name);

int (*real_open)(const char *name, int flags, int mode);

enum {
	OVERRIDE_OFF,		/* Pass-through to real function */
	OVERRIDE_ON,		/* Override with local function */
	OVERRIDE_MISSING,	/* Emulate missing support */
};
int meminfo_state = OVERRIDE_OFF;
int sysfs_state = OVERRIDE_OFF;

/*
 * Override opendir so we'll open the fake sysfs dir if intended
 */
DIR *opendir(const char *name)
{
	if (!real_opendir)
		real_opendir = dlsym(RTLD_NEXT, "opendir");

	/* Only override calls to the sysfs dir */
	if (strcmp(name, REAL_SYSFS_DIR))
		return real_opendir(name);

	switch (sysfs_state) {
	case OVERRIDE_OFF:
		return real_opendir(name);
	case OVERRIDE_ON:
		/* Only safe to override if fake_sysfs was set up */
		if (faked_data)
			return real_opendir(fake_sysfs);
		else
			FAIL("Trying to override opendir before initializing "
				"fake_sysfs directory\n");
	default:
		errno = ENOENT;
		return NULL;
	}
}

#define HPAGE_KB 2048
#define __HPAGE_STR_QUOTE(val) #val
#define __HPAGE_STR(val) __HPAGE_STR_QUOTE(val)
#define HPAGE_STR __HPAGE_STR(HPAGE_KB)

/*
 * Override open to simulate various contents for meminfo
 */
int open(const char *file, int flags, ...)
{
	int mode = 0;
	if (flags & O_CREAT) {
		va_list arg;
		va_start(arg, flags);
		mode = va_arg(arg, int);
		va_end(arg);
	}

	if (!real_open)
		real_open = dlsym(RTLD_NEXT, "open");

	switch (meminfo_state) {
	case OVERRIDE_OFF:
		break;
	case OVERRIDE_ON: {
		char fname[PATH_MAX];
		sprintf(fname, "%s/meminfo-hugepages", fake_meminfo);
		file = fname;
		break;
	}
	case OVERRIDE_MISSING: {
		char fname[PATH_MAX];
		sprintf(fname, "%s/meminfo-none", fake_meminfo);
		file = fname;
		break;
	}
	default:
		return -1;
	}
	return real_open(file, flags, mode);
}

/* Tear down the fake sysfs tree and meminfo files created by
 * setup_fake_data(), switching all overrides off first */
void cleanup_fake_data(void)
{
	DIR *dir;
	struct dirent *ent;
	char fname[PATH_MAX+1];

	meminfo_state = OVERRIDE_OFF;
	sysfs_state = OVERRIDE_OFF;

	faked_data = 0;
	dir = opendir(fake_sysfs);
	if (!dir)
		FAIL("opendir %s: %s", fake_sysfs, strerror(errno));

	while ((ent = readdir(dir))) {
		if (strncmp(ent->d_name, "hugepages-", 10))
			continue;
		snprintf(fname, PATH_MAX, "%s/%s", fake_sysfs,
			ent->d_name);
		/* Report the subdirectory that failed, not the parent */
		if (rmdir(fname))
			FAIL("rmdir %s: %s", fname, strerror(errno));
	}
	closedir(dir);
	if (rmdir(fake_sysfs))
		FAIL("rmdir %s: %s", fake_sysfs, strerror(errno));

	sprintf(fname, "%s/meminfo-none", fake_meminfo);
	if (unlink(fname) < 0)
		FAIL("unlink %s: %s", fname, strerror(errno));
	sprintf(fname, "%s/meminfo-hugepages", fake_meminfo);
	if (unlink(fname) < 0)
		FAIL("unlink %s: %s", fname, strerror(errno));
	if (rmdir(fake_meminfo))
		FAIL("rmdir %s: %s", fake_meminfo, strerror(errno));
}

char *meminfo_base = "\
MemTotal: 4004132 kB\n\
MemFree: 3563748 kB\n\
Buffers: 34804 kB\n\
Cached: 252544 kB\n\
SwapCached: 0 kB\n\
Active: 108912 kB\n\
Inactive: 187420 kB\n\
SwapTotal: 8008392 kB\n\
SwapFree: 8008392 kB\n\
Dirty: 4 kB\n\
Writeback: 0 kB\n\
AnonPages: 9100 kB\n\
Mapped: 7908 kB\n\
Slab: 40212 kB\n\
SReclaimable: 33312 kB\n\
SUnreclaim: 6900 kB\n\
PageTables: 1016 kB\n\
NFS_Unstable: 0 kB\n\
Bounce: 0 kB\n\
WritebackTmp: 0 kB\n\
CommitLimit: 9974616 kB\n\
Committed_AS: 29616 kB\n\
VmallocTotal: 34359738367 kB\n\
VmallocUsed: 23760 kB\n\
VmallocChunk: 34359714543 kB\n\
";
kB\n\ +CommitLimit: 9974616 kB\n\ +Committed_AS: 29616 kB\n\ +VmallocTotal: 34359738367 kB\n\ +VmallocUsed: 23760 kB\n\ +VmallocChunk: 34359714543 kB\n\ +"; + +char *meminfo_huge = "\ +HugePages_Total: 35\n\ +HugePages_Free: 35\n\ +HugePages_Rsvd: 0\n\ +HugePages_Surp: 0\n\ +Hugepagesize: " HPAGE_STR " kB\n\ +"; + +void setup_fake_data(long sizes[], int n_elem) +{ + int old_meminfo_state = meminfo_state; + int old_sysfs_state = sysfs_state; + + int i; + char fname[PATH_MAX+1]; + int fd; + + meminfo_state = OVERRIDE_OFF; + sysfs_state = OVERRIDE_OFF; + + if (faked_data) + cleanup_fake_data(); + + /* Generate some fake sysfs data. */ + if (!mkdtemp(fake_sysfs)) + FAIL("mkdtemp: %s", strerror(errno)); + faked_data = 1; + + for (i = 0; i < n_elem; i++) { + snprintf(fname, PATH_MAX, "%s/hugepages-%lukB", fake_sysfs, + sizes[i] / 1024); + if (mkdir(fname, 0700)) + FAIL("mkdir %s: %s", fname, strerror(errno)); + } + + /* Generate fake meminfo data. */ + if (!mkdtemp(fake_meminfo)) + FAIL("mkdtemp: %s", strerror(errno)); + + sprintf(fname, "%s/meminfo-none", fake_meminfo); + fd = open(fname, O_WRONLY|O_CREAT); + if (fd < 0) + FAIL("open: %s", strerror(errno)); + if (write(fd, meminfo_base, + strlen(meminfo_base)) != strlen(meminfo_base)) + FAIL("write: %s", strerror(errno)); + if (close(fd) < 0) + FAIL("close: %s", strerror(errno)); + + sprintf(fname, "%s/meminfo-hugepages", fake_meminfo); + fd = open(fname, O_WRONLY|O_CREAT); + if (fd < 0) + FAIL("open: %s", strerror(errno)); + if (write(fd, meminfo_base, + strlen(meminfo_base)) != strlen(meminfo_base)) + FAIL("write: %s", strerror(errno)); + if (write(fd, meminfo_huge, + strlen(meminfo_huge)) != strlen(meminfo_huge)) + FAIL("write: %s", strerror(errno)); + if (close(fd) < 0) + FAIL("close: %s", strerror(errno)); + + meminfo_state = old_meminfo_state; + sysfs_state = old_sysfs_state; +} + +void cleanup(void) +{ + if (faked_data) + cleanup_fake_data(); +} + +void validate_sizes(int line, long actual_sizes[], int actual, + 
int max, int maxmax, + long expected_sizes[], int expected) +{ + int i, j; + + verbose_printf("Line %d: Expecting sizes:", line); + for (i = 0; i < expected; i++) + verbose_printf(" %ld", expected_sizes[i]); + verbose_printf("\n"); + verbose_printf("Line %d: Actual sizes are:", line); + for (i = 0; i < actual; i++) + verbose_printf(" %ld", actual_sizes[i]); + verbose_printf("\n"); + + if (((expected <= max) && (expected != actual)) + || ((expected > max) && (actual < max))) + FAIL("Line %i: Wrong number of sizes returned -- expected %i " + "got %i", line, expected, actual); + else if (actual > max) + FAIL("Line %i: %i sizes returned > maximum %i", + line, actual, max); + + for (i = 0; i < actual; i++) { + for (j = 0; j < expected; j++) + if (actual_sizes[i] == expected_sizes[j]) + break; + if (j >= expected) + FAIL("Line %i: Actual size %li not found in expected " + "results", line, expected_sizes[i]); + } + + for (i = 0; i < actual; i++) + for (j = i+1; j < actual; j++) + if (actual_sizes[i] == actual_sizes[j]) + FAIL("Line %i: Duplicate size %li at %i/%i", + line, actual_sizes[i], i, j); + + for (i = actual; i < maxmax; i++) + if (actual_sizes[i] != 42) + FAIL("Line %i: Wrote past official limit at %i", + line, i); +} + +#define MAX 16 +#define EXPECT_SIZES(func, max, count, expected) \ +({ \ + long __a[MAX] = { [0 ... MAX-1] = 42 }; \ + int __na; \ + \ + __na = func(__a, max); \ + \ + validate_sizes(__LINE__, __a, __na, max, MAX, expected, count); \ + \ + __na; \ +}) + +#define INIT_LIST(a, values...) 
\ +({ \ + long __e[] = { values }; \ + memcpy(a, __e, sizeof(__e)); \ +}) + +int main(int argc, char *argv[]) +{ + long expected_sizes[MAX], actual_sizes[MAX]; + long base_size = sysconf(_SC_PAGESIZE); + + test_init(argc, argv); + + /* + * === + * Argment error checking tests + * === + */ + meminfo_state = OVERRIDE_OFF; + sysfs_state = OVERRIDE_OFF; + kernel_default_hugepage_size_reset(); + + if (gethugepagesizes(actual_sizes, -1) != -1 || errno != EINVAL) + FAIL("Mishandled params (n_elem < 0)"); + if (gethugepagesizes(NULL, 1) != -1 || errno != EINVAL) + FAIL("Mishandled params (pagesizes == NULL, n_elem > 0)"); + + if (getpagesizes(actual_sizes, -1) != -1 || errno != EINVAL) + FAIL("Mishandled params (n_elem < 0)"); + if (getpagesizes(NULL, 1) != -1 || errno != EINVAL) + FAIL("Mishandled params (pagesizes == NULL, n_elem > 0)"); + + /* + * === + * Test some corner cases using a fake system configuration + * === + */ + + INIT_LIST(expected_sizes, HPAGE_KB * 1024, 1024 * 1024, 64 * 1024); + setup_fake_data(expected_sizes, 3); + + /* + * Check handling when /proc/meminfo indicates no huge page support + * and the sysfs heirachy is not present. + */ + meminfo_state = OVERRIDE_MISSING; + sysfs_state = OVERRIDE_MISSING; + kernel_default_hugepage_size_reset(); + + EXPECT_SIZES(gethugepagesizes, MAX, 0, expected_sizes); + + INIT_LIST(expected_sizes, base_size); + EXPECT_SIZES(getpagesizes, MAX, 1, expected_sizes); + + /* ... only the meminfo size is returned. */ + meminfo_state = OVERRIDE_ON; + kernel_default_hugepage_size_reset(); + + INIT_LIST(expected_sizes, HPAGE_KB * 1024); + EXPECT_SIZES(gethugepagesizes, MAX, 1, expected_sizes); + + INIT_LIST(expected_sizes, base_size, HPAGE_KB * 1024); + EXPECT_SIZES(getpagesizes, MAX, 2, expected_sizes); + + /* + * When sysfs defines additional sizes ... + */ + sysfs_state = OVERRIDE_ON; + kernel_default_hugepage_size_reset(); + + INIT_LIST(expected_sizes, HPAGE_KB * 1024, 1024 * 1024, 64 * 1024); + + /* ... 
make sure all sizes are returned without duplicates */ + /* ... while making sure we do not overstep our limit */ + EXPECT_SIZES(gethugepagesizes, MAX, 3, expected_sizes); + EXPECT_SIZES(gethugepagesizes, 1, 3, expected_sizes); + EXPECT_SIZES(gethugepagesizes, 2, 3, expected_sizes); + EXPECT_SIZES(gethugepagesizes, 3, 3, expected_sizes); + EXPECT_SIZES(gethugepagesizes, 4, 3, expected_sizes); + + INIT_LIST(expected_sizes, + base_size, HPAGE_KB * 1024, 1024 * 1024, 64 * 1024); + EXPECT_SIZES(getpagesizes, MAX, 4, expected_sizes); + EXPECT_SIZES(getpagesizes, 1, 4, expected_sizes); + EXPECT_SIZES(getpagesizes, 2, 4, expected_sizes); + EXPECT_SIZES(getpagesizes, 3, 4, expected_sizes); + EXPECT_SIZES(getpagesizes, 4, 4, expected_sizes); + EXPECT_SIZES(getpagesizes, 5, 4, expected_sizes); + + /* ... we can check how many sizes are supported. */ + if (gethugepagesizes(NULL, 0) != 3) + FAIL("Unable to check the number of supported sizes"); + + if (getpagesizes(NULL, 0) != 4) + FAIL("Unable to check the number of supported sizes"); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/heap-overflow.c b/default/libhugetlbfs/libhugetlbfs/tests/heap-overflow.c new file mode 100644 index 0000000..044c3fd --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/heap-overflow.c @@ -0,0 +1,110 @@ +/* + * Test heap overflow for libhugetlbfs. + * Copyright 2008 Cray Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, 5th Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <sys/wait.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +long oc_pool = -1; +long hpagesize; + +void cleanup(void) +{ + if (oc_pool > 0) + restore_overcommit_pages(hpagesize, oc_pool); +} + +int main(int argc, char **argv) +{ + int freepages; + long size1, size2; + void *p1, *p2; + int st, pid, rv; + unsigned long long mapping_size; + + test_init(argc, argv); + + if (!getenv("HUGETLB_MORECORE")) + CONFIG("Must have HUGETLB_MORECORE=yes"); + + hpagesize = check_hugepagesize(); + + /* Must be root because this test modifies the overcommit pool */ + check_must_be_root(); + + oc_pool = read_nr_overcommit(hpagesize); + if (oc_pool > 0) + set_nr_overcommit_hugepages(hpagesize, 0); + + freepages = get_huge_page_counter(hpagesize, HUGEPAGES_FREE); + if (freepages < 3) + CONFIG("Must have at least 3 free hugepages"); + + /* + * Allocation 1: one hugepage. Due to malloc overhead, morecore + * will probably mmap two hugepages. + */ + size1 = hpagesize; + p1 = malloc(size1); + if (!p1) + FAIL("Couldn't malloc %ld bytes", size1); + mapping_size = get_mapping_page_size(p1); + if (mapping_size != hpagesize) + FAIL("First allocation %p not on hugepages", p1); + + /* + * Allocation 2: all free hugepages to ensure we exhaust the free pool. + */ + size2 = freepages * hpagesize; + p2 = malloc(size2); + if (!p2) + FAIL("Couldn't malloc %ld bytes", size2); + mapping_size = get_mapping_page_size(p1); + st = (mapping_size == hpagesize); + verbose_printf("Second allocation %p huge? %s\n", p2, st < 0 ? "??" : + (st ? "yes" : "no")); + + /* + * Touch the pages in a child process. Kernel sends a SIGKILL if + * we run out of hugepages. 
+ */ + pid = fork(); + if (pid < 0) + FAIL("fork: %s", strerror(errno)); + + if (pid == 0) { + memset(p1, 0, size1); + memset(p2, 0, size2); + exit(0); + } + + rv = waitpid(pid, &st, 0); + if (rv < 0) + FAIL("waitpid: %s\n", strerror(errno)); + if (WIFSIGNALED(st)) + FAIL("Child killed by signal %d touching malloc'ed memory", + WTERMSIG(st)); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/heapshrink-helper.c b/default/libhugetlbfs/libhugetlbfs/tests/heapshrink-helper.c new file mode 100644 index 0000000..e793ff6 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/heapshrink-helper.c @@ -0,0 +1,25 @@ +/* + * Test heap shrinking for libhugetlbfs. + * Copyright 2008 Cray Inc. All rights reserved. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, 5th Floor, Boston, MA 02110-1301, USA. + */ + +#include <malloc.h> + +static void __attribute__((constructor)) setup_heapshrink_helper(void) +{ + (void) malloc(1); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/heapshrink.c b/default/libhugetlbfs/libhugetlbfs/tests/heapshrink.c new file mode 100644 index 0000000..0644c78 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/heapshrink.c @@ -0,0 +1,74 @@ +/* + * Test heap shrinking for libhugetlbfs. + * Copyright 2007 Cray Inc. All rights reserved. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, 5th Floor, Boston, MA 02110-1301, USA. + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include "hugetests.h" + +/* + * We cannot test mapping size against huge page size because we are not linked + * against libhugetlbfs so gethugepagesize() won't work. So instead we define + * our MIN_PAGE_SIZE as 64 kB (the largest base page available) and make sure + * the mapping page size is larger than this. 
+ */ +#define MIN_PAGE_SIZE 65536 + +#define SIZE (32 * 1024 * 1024) + +int main(int argc, char **argv) +{ + int is_huge, have_env, shrink_ok, have_helper; + unsigned long long mapping_size; + void *p; + + test_init(argc, argv); + + have_env = getenv("HUGETLB_MORECORE") != NULL; + shrink_ok = getenv("HUGETLB_MORECORE_SHRINK") != NULL; + p = getenv("LD_PRELOAD"); + have_helper = p != NULL && strstr(p, "heapshrink") != NULL; + + p = malloc(SIZE); + if (!p) { + if (shrink_ok && have_helper) { + /* Hitting unexpected behavior in malloc() */ + PASS_INCONCLUSIVE(); + } else + FAIL("malloc(%d) failed\n", SIZE); + } + memset(p, 0, SIZE); + mapping_size = get_mapping_page_size(p); + is_huge = (mapping_size > MIN_PAGE_SIZE); + if (have_env && !is_huge) { + if (shrink_ok && have_helper) { + /* Hitting unexpected behavior in malloc() */ + PASS_INCONCLUSIVE(); + } else + FAIL("Heap not on hugepages"); + } + if (!have_env && is_huge) + FAIL("Heap unexpectedly on hugepages"); + + free(p); + mapping_size = get_mapping_page_size(p+SIZE-1); + if (shrink_ok && mapping_size > MIN_PAGE_SIZE) + FAIL("Heap did not shrink"); + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/huge_at_4GB_normal_below.c b/default/libhugetlbfs/libhugetlbfs/tests/huge_at_4GB_normal_below.c new file mode 100644 index 0000000..4134d03 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/huge_at_4GB_normal_below.c @@ -0,0 +1,94 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * Designed to pick up a bug on ppc64 where + * touches_hugepage_high_range() falsely reported true for ranges + * reaching below 4GB + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. 
+ */ + +int main(int argc, char *argv[]) +{ + int page_size; + long hpage_size; + int fd; + void *p, *q; + unsigned long lowaddr; + int err; + + test_init(argc, argv); + + page_size = getpagesize(); + + hpage_size = check_hugepagesize(); + + if (sizeof(void *) <= 4) + IRRELEVANT(); + + if (hpage_size > FOURGB) + CONFIG("Huge page size is too large"); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap((void *)FOURGB, hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() huge: %s", strerror(errno)); + if (p != (void *)FOURGB) + FAIL("Wrong address with MAP_FIXED huge"); + + verbose_printf("Mapped hugetlb at %p\n", p); + + memset(p, 0, hpage_size); + + err = test_addr_huge(p); + if (err != 1) + FAIL("Mapped address is not hugepage"); + + /* Test just below 4GB to check for off-by-one errors */ + lowaddr = FOURGB - page_size; + q = mmap((void *)lowaddr, page_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED|MAP_ANONYMOUS, 0, 0); + if (q == MAP_FAILED) + FAIL("mmap() normal: %s", strerror(errno)); + if (q != (void *)lowaddr) + FAIL("Wrong address with MAP_FIXED normal"); + + memset(q, 0, page_size); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/huge_below_4GB_normal_above.c b/default/libhugetlbfs/libhugetlbfs/tests/huge_below_4GB_normal_above.c new file mode 100644 index 0000000..7747894 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/huge_below_4GB_normal_above.c @@ -0,0 +1,117 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * Designed to pick up a bug on ppc64 where + * touches_hugepage_low_range() could give false positives because of + * the peculiar (undefined) behaviour of << for large shifts + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. 
+ */ + +int main(int argc, char *argv[]) +{ + int page_size; + long hpage_size; + int fd; + void *p, *q; + unsigned long lowaddr, highaddr; + int err; + + test_init(argc, argv); + + page_size = getpagesize(); + + hpage_size = check_hugepagesize(); + + if (sizeof(void *) <= 4) + IRRELEVANT(); + + if (hpage_size > FOURGB) + CONFIG("Huge page size is too large"); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + + + /* We use a low address right below 4GB so we can test for + * off-by-one errors */ + lowaddr = FOURGB - hpage_size; + verbose_printf("Mapping hugepage at at %lx...", lowaddr); + p = mmap((void *)lowaddr, hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() huge: %s", strerror(errno)); + if (p != (void *)lowaddr) + FAIL("Wrong address with MAP_FIXED huge"); + verbose_printf("done\n"); + + memset(p, 0, hpage_size); + + err = test_addr_huge(p); + if (err != 1) + FAIL("Mapped address is not hugepage"); + + /* Test for off by one errors */ + highaddr = FOURGB; + verbose_printf("Mapping normal page at %lx...", highaddr); + q = mmap((void *)highaddr, page_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED|MAP_ANONYMOUS, 0, 0); + if (q == MAP_FAILED) + FAIL("mmap() normal 1: %s", strerror(errno)); + if (q != (void *)highaddr) + FAIL("Wrong address with MAP_FIXED normal 2"); + verbose_printf("done\n"); + + memset(q, 0, page_size); + + /* Why this address? Well on ppc64, we're working with 256MB + * segment numbers, hence >>28. In practice the shift + * instructions only start wrapping around with shifts 128 or + * greater. 
*/ + highaddr = ((lowaddr >> 28) + 128) << 28; + verbose_printf("Mapping normal page at %lx...", highaddr); + q = mmap((void *)highaddr, page_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED|MAP_ANONYMOUS, 0, 0); + if (q == MAP_FAILED) + FAIL("mmap() normal 2: %s", strerror(errno)); + if (q != (void *)highaddr) + FAIL("Wrong address with MAP_FIXED normal 2"); + verbose_printf("done\n"); + + memset(q, 0, page_size); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/hugetests.h b/default/libhugetlbfs/libhugetlbfs/tests/hugetests.h new file mode 100644 index 0000000..a5a54d6 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/hugetests.h @@ -0,0 +1,142 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MECHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HUGETESTS_H +#define _HUGETESTS_H + +#include <errno.h> +#include <string.h> + +#include "libhugetlbfs_privutils.h" +#include "libhugetlbfs_testprobes.h" + +#define DEBUG + +/* Test return codes */ +#define RC_PASS 0 +#define RC_CONFIG 1 +#define RC_FAIL 2 +#define RC_XFAIL 3 /* Expected Failure */ +#define RC_XPASS 4 /* Unexpected Pass */ +#define RC_BUG 99 + +#define FOURGB (1UL << 32) + +extern int verbose_test; +extern char *test_name; +void check_free_huge_pages(int nr_pages_needed); +void check_must_be_root(void); +void check_hugetlb_shm_group(void); +void test_init(int argc, char *argv[]); +int test_addr_huge(void *p); +unsigned long long get_mapping_page_size(void *p); +long read_meminfo(const char *tag); +ino_t get_addr_inode(void *p); + +#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1)) +#define PALIGN(p, a) ((void *)ALIGN((unsigned long)(p), (a))) + +#ifndef barrier +# ifdef mb +# define barrier() mb() +# else +# define barrier() __asm__ __volatile__ ("" : : : "memory") +# endif +#endif + +/* Each test case must define this function */ +void cleanup(void); + +#define verbose_printf(...) \ + if (verbose_test) { \ + printf(__VA_ARGS__); \ + fflush(stdout); \ + } +#define ERR "ERR: " +#define ERROR(fmt, args...) fprintf(stderr, ERR fmt, ## args) + + +#define PASS() \ + do { \ + cleanup(); \ + printf("PASS\n"); \ + exit(RC_PASS); \ + } while (0) + +#define PASS_INCONCLUSIVE() \ + do { \ + cleanup(); \ + printf("PASS (inconclusive)\n"); \ + exit(RC_PASS); \ + } while (0) + +#define IRRELEVANT() \ + do { \ + cleanup(); \ + printf("PASS (irrelevant)\n"); \ + exit(RC_PASS); \ + } while (0) + +/* Look out, gcc extension below... */ +#define FAIL(fmt, ...) 
\ + do { \ + cleanup(); \ + printf("FAIL\t" fmt "\n", ##__VA_ARGS__); \ + exit(RC_FAIL); \ + } while (0) + +#define CONFIG(fmt, ...) \ + do { \ + cleanup(); \ + printf("Bad configuration: " fmt "\n", ##__VA_ARGS__); \ + exit(RC_CONFIG); \ + } while (0) + +#define TEST_BUG(fmt, ...) \ + do { \ + cleanup(); \ + printf("BUG in testsuite: " fmt "\n", ##__VA_ARGS__); \ + exit(RC_BUG); \ + } while (0) + +/* stressutils.c stuff */ +int remove_shmid(int shmid); + +extern long gethugepagesize (void) __attribute__ ((weak)); + +static inline long check_hugepagesize() +{ + long __hpage_size = gethugepagesize(); + if (__hpage_size < 0) { + if (errno == ENOSYS) + CONFIG("No hugepage kernel support\n"); + else if (errno == EOVERFLOW) + CONFIG("Hugepage size too large"); + else + CONFIG("Hugepage size (%s)", strerror(errno)); + } + return __hpage_size; +} + +int using_system_hpage_size(const char *mount); + +/* WARNING: Racy -- use for test cases only! */ +int kernel_has_private_reservations(void); + +#endif /* _HUGETESTS_H */ diff --git a/default/libhugetlbfs/libhugetlbfs/tests/icache-hygiene.c b/default/libhugetlbfs/libhugetlbfs/tests/icache-hygiene.c new file mode 100644 index 0000000..eb64a62 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/icache-hygiene.c @@ -0,0 +1,215 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Test rationale: + * + * Older ppc64 kernels don't properly flush dcache to icache before + * giving a cleared page to userspace. With some exceedingly hairy + * code, this attempts to test for this bug. + * + * This test will never trigger (obviously) on machines with coherent + * icache and dcache (including x86 and POWER5). On any given run, + * even on a buggy kernel there's a chance the bug won't trigger - + * either because we don't get the same physical page back when we + * remap, or because the icache happens to get flushed in the interim. + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <setjmp.h> +#include <unistd.h> +#include <signal.h> +#include <sys/mman.h> +#include <ucontext.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define COPY_SIZE 128 +#define NUM_REPETITIONS 64 /* Seems to be enough to trigger reliably */ + +static long hpage_size; + +static void cacheflush(void *p) +{ +#ifdef __powerpc__ + asm volatile("dcbst 0,%0; sync; icbi 0,%0; isync" : : "r"(p)); +#endif +} + +static void jumpfunc(int copy, void *p) +{ + /* gcc bug workaround: if there is exactly one &&label + * construct in the function, gcc assumes the computed goto + * goes there, leading to the complete elision of the goto in + * this case */ + void *l = &&dummy; + l = &&jumplabel; + + if (copy) { + memcpy(p, l, COPY_SIZE); + cacheflush(p); + } + + goto *p; + dummy: + printf("unreachable?\n"); + + jumplabel: + return; +} + +static sigjmp_buf sig_escape; +static void *sig_expected; + +static void sig_handler(int signum, siginfo_t *si, void *uc) +{ +#if defined(__powerpc__) || defined(__powerpc64__) || defined(__ia64__) || \ + defined(__s390__) || defined(__s390x__) || defined(__sparc__) + /* 
On powerpc and ia64 and s390, 0 bytes are an illegal + * instruction, so, if the icache is cleared properly, we SIGILL + * as soon as we jump into the cleared page */ + if (signum == SIGILL) { + verbose_printf("SIGILL at %p (sig_expected=%p)\n", si->si_addr, + sig_expected); + if (si->si_addr == sig_expected) { + siglongjmp(sig_escape, 1); + } + FAIL("SIGILL somewhere unexpected"); + } +#elif defined(__i386__) || defined(__x86_64__) + /* On x86, zero bytes form a valid instruction: + * add %al,(%eax) (i386) + * or add %al,(%rax) (x86_64) + * + * So, behaviour depends on the contents of [ER]AX, which in + * turn depends on the details of code generation. If [ER]AX + * contains a valid pointer, we will execute the instruction + * repeatedly until we run off that hugepage and get a SIGBUS + * on the second, truncated page. If [ER]AX does not contain + * a valid pointer, we will SEGV on the first instruction in + * the cleared page. We check for both possibilities + * below. */ + if (signum == SIGBUS) { + verbose_printf("SIGBUS at %p (sig_expected=%p)\n", si->si_addr, + sig_expected); + if (sig_expected + && (ALIGN((unsigned long)sig_expected, gethugepagesize()) + == (unsigned long)si->si_addr)) { + siglongjmp(sig_escape, 2); + } + FAIL("SIGBUS somewhere unexpected"); + } + if (signum == SIGSEGV) { +#ifdef __x86_64__ + void *pc = (void *)((ucontext_t *)uc)->uc_mcontext.gregs[REG_RIP]; +#else + void *pc = (void *)((ucontext_t *)uc)->uc_mcontext.gregs[REG_EIP]; +#endif + + verbose_printf("SIGSEGV at %p, PC=%p (sig_expected=%p)\n", + si->si_addr, pc, sig_expected); + if (sig_expected == pc) { + siglongjmp(sig_escape, 1); + } + FAIL("SIGSEGV somewhere unexpected"); + } +#else +#error Need to setup signal conditions for this arch +#endif +} + +static void test_once(int fd) +{ + void *p, *q; + int dummy; + + dummy = ftruncate(fd, 0); + + if (sigsetjmp(sig_escape, 1)) { + sig_expected = NULL; + dummy = ftruncate(fd, 0); + return; + } + + p = mmap(NULL, 2*hpage_size, 
PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + dummy = ftruncate(fd, hpage_size); + + q = p + hpage_size - COPY_SIZE; + + jumpfunc(1, q); + + dummy = ftruncate(fd, 0); + p = mmap(p, hpage_size, PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_SHARED|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + q = p + hpage_size - COPY_SIZE; + sig_expected = q; + + jumpfunc(0, q); /* This should blow up */ + + FAIL("icache unclean"); +} + +int main(int argc, char *argv[]) +{ + int fd; + int err; + int i; + + test_init(argc, argv); + + struct sigaction sa = { + .sa_sigaction = sig_handler, + .sa_flags = SA_SIGINFO, + }; + + hpage_size = check_hugepagesize(); + + err = sigaction(SIGILL, &sa, NULL); + if (err) + FAIL("Can't install SIGILL handler: %s", strerror(errno)); + + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("Can't install SIGBUS handler: %s", strerror(errno)); + + err = sigaction(SIGSEGV, &sa, NULL); + if (err) + FAIL("Can't install SIGSEGV handler: %s", strerror(errno)); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + CONFIG("Couldn't get hugepage fd"); + + for (i = 0; i < NUM_REPETITIONS; i++) + test_once(fd); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/large_mounts.c b/default/libhugetlbfs/libhugetlbfs/tests/large_mounts.c new file mode 100644 index 0000000..14376e9 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/large_mounts.c @@ -0,0 +1,117 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 Eric Munson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <sys/types.h> +#include <sys/stat.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <dlfcn.h> +#include <stdarg.h> +#include <errno.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define BUF_SIZE 4096 +#define FILLER "tmpfs /var/run tmpfs rw,nosuid,nodev,noexec,mode=755 0 0\n" + +int in_test; /* = 0; */ +int tmp_mounts_fd; /* = 0; */ +FILE *tmp_stream; /* = NULL; */ + +/* + * We override the normal open, so we can remember the fd for the + * mounts file + */ +int open(const char *path, int flags, ...) 
+{ + int (*old_open)(const char *, int, ...); + int fd; + va_list ap; + + old_open = dlsym(RTLD_NEXT, "open"); + if (in_test && strcmp(path, "/proc/mounts") == 0) + return tmp_mounts_fd; + va_start(ap, flags); + fd = (old_open)(path, flags, va_arg(ap, mode_t)); + va_end(ap); + return fd; +} + +void make_test_mounts() +{ + char buf[BUF_SIZE]; + int mounts_fd; + unsigned int written = 0; + int ret; + int filler_sz; + + mounts_fd = open("/proc/mounts", O_RDONLY); + if (mounts_fd < 0) + FAIL("Unable to open /proc/mounts: %s", strerror(errno)); + tmp_stream = tmpfile(); + if (!tmp_stream) + FAIL("Unable to open temporary mounts file: %s", strerror(errno)); + + tmp_mounts_fd = fileno(tmp_stream); + if (tmp_mounts_fd < 0) + FAIL("Unable to get file descriptor from stream."); + + filler_sz = strlen(FILLER); + + while (written < BUF_SIZE) { + if (write(tmp_mounts_fd, FILLER, filler_sz) < 0) + FAIL("Unable to write to temp mounts file: %s", + strerror(errno)); + written += filler_sz; + } + + while ((ret = read(mounts_fd, buf, BUF_SIZE)) > 0) + if (write(tmp_mounts_fd, buf, ret) < 0) + FAIL("Unable to write to temp mounts file: %s", + strerror(errno)); + + close(mounts_fd); + if (lseek(tmp_mounts_fd, 0, SEEK_SET) < 0) + FAIL("Unable to move temp mounts stream to beginning of file: %s", + strerror(errno)); +} + +int main(int argc, char *argv[]) +{ + int fd; + + make_test_mounts(); + test_init(argc, argv); + in_test = 1; + + fd = hugetlbfs_unlinked_fd(); + + fclose(tmp_stream); + if (fd < 0) + FAIL("Unable to find mount point\n"); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/libtestutils.c b/default/libhugetlbfs/libhugetlbfs/tests/libtestutils.c new file mode 100644 index 0000000..4eeb880 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/libtestutils.c @@ -0,0 +1,138 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2008 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LARGEFILE64_SOURCE +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include <errno.h> +#include <ctype.h> +#include <unistd.h> +#include <signal.h> +#include <sys/types.h> +#include <sys/vfs.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> + +#include "hugetlbfs.h" +#include "libhugetlbfs_privutils.h" +#include "hugetests.h" + +void check_free_huge_pages(int nr_pages_needed) +{ + long hpage_size = gethugepagesize(); + int freepages = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + if (freepages < nr_pages_needed) + CONFIG("Must have at least %i free hugepages", nr_pages_needed); +} + +int using_system_hpage_size(const char *mount) +{ + struct statfs64 sb; + int err; + long meminfo_size, mount_size; + + if (!mount) + FAIL("using_system_hpage_size: hugetlbfs is not mounted\n"); + + err = statfs64(mount, &sb); + if (err) + FAIL("statfs64: %s\n", strerror(errno)); + + meminfo_size = read_meminfo("Hugepagesize:"); + if (meminfo_size < 0) + FAIL("using_system_hpage_size: Failed to read /proc/meminfo\n"); + + mount_size = sb.f_bsize / 1024; /* Compare to meminfo in kB */ + if (mount_size == 
meminfo_size) + return 1; + else + return 0; +} + +/* WARNING: This function relies on the hugetlb pool counters in a way that + * is known to be racy. Due to the expected usage of hugetlbfs test cases, the + * risk of a race is acceptible. This function should NOT be used for real + * applications. + */ +int kernel_has_private_reservations(void) +{ + int fd; + long t, f, r, s; + long nt, nf, nr, ns; + long hpage_size = gethugepagesize(); + void *map; + + /* Read pool counters */ + t = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + f = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + r = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + s = get_huge_page_counter(hpage_size, HUGEPAGES_SURP); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) { + ERROR("kernel_has_private_reservations: hugetlbfs_unlinked_fd: " + "%s\n", strerror(errno)); + return -1; + } + map = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (map == MAP_FAILED) { + ERROR("kernel_has_private_reservations: mmap: %s\n", + strerror(errno)); + return -1; + } + + /* Recheck the counters */ + nt = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + nf = get_huge_page_counter(hpage_size, HUGEPAGES_FREE); + nr = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + ns = get_huge_page_counter(hpage_size, HUGEPAGES_SURP); + + munmap(map, hpage_size); + close(fd); + + /* + * There are only three valid cases: + * 1) If a surplus page was allocated to create a reservation, all + * four pool counters increment + * 2) All counters remain the same except for Hugepages_Rsvd, then + * a reservation was created using an existing pool page. 
+ * 3) All counters remain the same, indicates that no reservation has + * been created + */ + if ((nt == t + 1) && (nf == f + 1) && (ns == s + 1) && (nr == r + 1)) { + return 1; + } else if ((nt == t) && (nf == f) && (ns == s)) { + if (nr == r + 1) + return 1; + else if (nr == r) + return 0; + } else { + ERROR("kernel_has_private_reservations: bad counter state - " + "T:%li F:%li R:%li S:%li -> T:%li F:%li R:%li S:%li\n", + t, f, r, s, nt, nf, nr, ns); + } + return -1; +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/linkhuge.c b/default/libhugetlbfs/libhugetlbfs/tests/linkhuge.c new file mode 100644 index 0000000..05d9924 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/linkhuge.c @@ -0,0 +1,176 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include "hugetests.h" + +#define BLOCK_SIZE 16384 +#define CONST 0xdeadbeef + +#define BIG_INIT { \ + [0] = CONST, [17] = CONST, [BLOCK_SIZE-1] = CONST, \ +} +static int small_data = 1; +static int big_data[BLOCK_SIZE] = BIG_INIT; + +static int small_bss; +static int big_bss[BLOCK_SIZE]; + +const int small_const = CONST; +const int big_const[BLOCK_SIZE] = BIG_INIT; + +static int static_func(int x) +{ + return x; +} + +int global_func(int x) +{ + return x; +} + +static struct test_entry { + const char *name; + void *data; + int size; + char linkchar; + int writable, execable; + int is_huge; +} testtab[] = { +#define RWENT(name, linkchar) { #name, &name, sizeof(name), linkchar, 1, 0, } +#define ROENT(name, linkchar) { #name, (void *)&name, sizeof(name), linkchar, 0, 0, } +#define RXENT(name, linkchar) { #name, &name, sizeof(name), linkchar, 0, 1, } + RWENT(small_data, 'D'), + RWENT(big_data, 'D'), + RWENT(small_bss, 'B'), + RWENT(big_bss, 'B'), + ROENT(small_const, 'T'), + ROENT(big_const, 'T'), + RXENT(static_func, 'T'), + RXENT(global_func, 'T'), +}; + +#define NUM_TESTS (sizeof(testtab) / sizeof(testtab[0])) + +static char link_string[32]; + +static void get_link_string(const char *argv0) +{ + const char *p, *q; + + /* Find program basename */ + p = strrchr(argv0, '/'); + if (p) + p++; + else + p = argv0; + + if (*p != 'x') + return; /* just a plain ordinary link */ + + q = strchr(p, '.'); + if (!q) + /* ERROR? 
*/ + return; + + memcpy(link_string, p, q-p); +} + +static void do_test(struct test_entry *te) +{ + int i; + volatile int *p = te->data; + + if (te->writable) { + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i] = CONST ^ i; + + barrier(); + + for (i = 0; i < (te->size / sizeof(*p)); i++) + if (p[i] != (CONST ^ i)) + FAIL("mismatch on %s", te->name); + } else if (te->execable) { + int (*pf)(int) = te->data; + + if ((*pf)(CONST) != CONST) + FAIL("%s returns incorrect results", te->name); + } else { + /* Otherwise just read touch it */ + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i]; + } + + te->is_huge = (test_addr_huge(te->data) == 1); +} + +int main(int argc, char *argv[]) +{ + int i; + char *env; + int elfmap_inhibited; + + test_init(argc, argv); + + get_link_string(argv[0]); + + env = getenv("HUGETLB_ELFMAP"); + + verbose_printf("Link string is [%s], HUGETLB_ELFMAP=%s\n", + link_string, env); + + elfmap_inhibited = env && (strcasecmp(env, "no") == 0); + + for (i = 0; i < NUM_TESTS; i++) { + do_test(testtab + i); + } + + verbose_printf("Hugepages used for:"); + for (i = 0; i < NUM_TESTS; i++) + if (testtab[i].is_huge) + verbose_printf(" %s", testtab[i].name); + verbose_printf("\n"); + + for (i = 0; i < NUM_TESTS; i++) { + char linkchar = testtab[i].linkchar; + + if (elfmap_inhibited) { + if (testtab[i].is_huge) + FAIL("%s is hugepage despite HUGETLB_ELFMAP=%s\n", + testtab[i].name, env); + } else { + if (linkchar && strchr(link_string, linkchar)) { + if (! 
testtab[i].is_huge) + FAIL("%s is not hugepage\n", + testtab[i].name); + } + if (linkchar && !strchr(link_string, linkchar)) { + if (testtab[i].is_huge) + FAIL("%s is hugepage\n", + testtab[i].name); + } + } + } + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/linkhuge_nofd.c b/default/libhugetlbfs/libhugetlbfs/tests/linkhuge_nofd.c new file mode 100644 index 0000000..f04cd8e --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/linkhuge_nofd.c @@ -0,0 +1,42 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include "hugetests.h" + +/* Override the working version from libhugetlbfs */ +int hugetlbfs_unlinked_fd_for_size(long page_size) +{ + return -1; +} + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + + /* All we're testing is that we survive the library attempting + * and failing to remap us into hugepages */ + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/linkhuge_rw.c b/default/libhugetlbfs/libhugetlbfs/tests/linkhuge_rw.c new file mode 100644 index 0000000..f58fff2 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/linkhuge_rw.c @@ -0,0 +1,210 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2008 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <elf.h> +#include <link.h> + +#include "hugetests.h" + +#define BLOCK_SIZE 16384 +#define CONST 0xdeadbeef + +#define BIG_INIT { \ + [0] = CONST, [17] = CONST, [BLOCK_SIZE-1] = CONST, \ +} +static int small_data = 1; +static int big_data[BLOCK_SIZE] = BIG_INIT; + +static int small_bss; +static int big_bss[BLOCK_SIZE]; + +const int small_const = CONST; +const int big_const[BLOCK_SIZE] = BIG_INIT; + +static int static_func(int x) +{ + return x; +} + +int global_func(int x) +{ + return x; +} + +static struct test_entry { + const char *name; + void *data; + int size; + int writable, execable; + int is_huge; +} testtab[] = { +#define ENT(name, exec) { #name, (void *)&name, sizeof(name), 0, exec, } + ENT(small_data, 0), + ENT(big_data, 0), + ENT(small_bss, 0), + ENT(big_bss, 0), + ENT(small_const, 0), + ENT(big_const, 0), + + /* + * XXX: Due to the way functions are defined in the powerPC 64-bit ABI, + * the following entries will point to a call stub in the data segment + * instead of to the code as one might think. Therefore, test coverage + * is not quite as good as it could be for ppc64. 
+ */ + ENT(static_func, 1), + ENT(global_func, 1), +}; + +#define NUM_TESTS (sizeof(testtab) / sizeof(testtab[0])) + +static +int parse_elf(struct dl_phdr_info *info, size_t size, void *data) +{ + int i; + unsigned long text_end, data_start; + long *min_align = (long *)data; + long actual_align; + + text_end = data_start = 0; + for (i = 0; i < info->dlpi_phnum; i++) { + if (info->dlpi_phdr[i].p_type != PT_LOAD) + continue; + + if (info->dlpi_phdr[i].p_flags & PF_X) + text_end = info->dlpi_phdr[i].p_vaddr + + info->dlpi_phdr[i].p_memsz; + else if (info->dlpi_phdr[i].p_flags & PF_W) + data_start = info->dlpi_phdr[i].p_vaddr; + + if (text_end && data_start) + break; + } + + actual_align = (data_start - text_end) / 1024; + if (actual_align < *min_align) + FAIL("Binary not suitably aligned"); + + return 1; +} + +static void check_if_writable(struct test_entry *te) +{ + int pid, ret, status; + + + pid = fork(); + if (pid < 0) + FAIL("fork: %s", strerror(errno)); + else if (pid == 0) { + (*(char *) te->data) = 0; + exit (0); + } else { + ret = waitpid(pid, &status, 0); + if (ret < 0) + FAIL("waitpid(): %s", strerror(errno)); + if (WIFSIGNALED(status)) + te->writable = 0; + else + te->writable = 1; + } +} + +static void do_test(struct test_entry *te) +{ + int i; + volatile int *p = te->data; + + check_if_writable(te); + + if (te->writable) { + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i] = CONST ^ i; + + barrier(); + + for (i = 0; i < (te->size / sizeof(*p)); i++) + if (p[i] != (CONST ^ i)) + FAIL("mismatch on %s", te->name); + } else if (te->execable) { + int (*pf)(int) = te->data; + + if ((*pf)(CONST) != CONST) + FAIL("%s returns incorrect results", te->name); + } else { + /* Otherwise just read touch it */ + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i]; + } + + te->is_huge = (test_addr_huge(te->data) == 1); +} + +int main(int argc, char *argv[]) +{ + int i; + char *env; + int elfmap_readonly, elfmap_writable; + long hpage_size = gethugepagesize() / 1024; 
+ + test_init(argc, argv); + + /* Test that the binary has been aligned enough by the linker */ + if ((argc > 1) && !strcmp("--test-alignment", argv[1])) + dl_iterate_phdr(parse_elf, &hpage_size); + + env = getenv("HUGETLB_ELFMAP"); + verbose_printf("HUGETLB_ELFMAP=%s\n", env); + + elfmap_readonly = env && strchr(env, 'R'); + elfmap_writable = env && strchr(env, 'W'); + + for (i = 0; i < NUM_TESTS; i++) { + do_test(testtab + i); + } + + verbose_printf("Hugepages used for:"); + for (i = 0; i < NUM_TESTS; i++) + if (testtab[i].is_huge) + verbose_printf(" %s", testtab[i].name); + verbose_printf("\n"); + + for (i = 0; i < NUM_TESTS; i++) { + if (testtab[i].writable) { + if (elfmap_writable && !testtab[i].is_huge) + FAIL("%s is not hugepage", testtab[i].name); + if (!elfmap_writable && testtab[i].is_huge) + FAIL("%s is hugepage", testtab[i].name); + } else if (!testtab[i].writable) { + if (elfmap_readonly && !testtab[i].is_huge) + FAIL("%s is not hugepage", testtab[i].name); + if (!elfmap_readonly && testtab[i].is_huge) + FAIL("%s is hugepage", testtab[i].name); + } + } + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/linkshare.c b/default/libhugetlbfs/libhugetlbfs/tests/linkshare.c new file mode 100644 index 0000000..f86e041 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/linkshare.c @@ -0,0 +1,373 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2006 Nishanth Aravamudan, IBM Corporation + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <time.h> +#include <errno.h> +#include <limits.h> +#include <string.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/shm.h> +#include <sys/wait.h> + +#include "hugetests.h" + +#define BLOCK_SIZE 16384 +#define CONST 0xdeadbeef +#define SHM_KEY 0xdeadcab +#define NUM_CHILDREN 2 + +#define BIG_INIT { \ + [0] = CONST, [17] = CONST, [BLOCK_SIZE-1] = CONST, \ +} +static int small_data = 1; +static int big_data[BLOCK_SIZE] = BIG_INIT; + +static int small_bss; +static int big_bss[BLOCK_SIZE]; + +const int small_const = CONST; +const int big_const[BLOCK_SIZE] = BIG_INIT; + +static int static_func(int x) +{ + return x; +} + +int global_func(int x) +{ + return x; +} + +static struct test_entry { + const char *name; + void *data; + int size; + char linkchar; + int writable, execable; + int is_huge; +} testtab[] = { +#define RWENT(name, linkchar) { #name, &name, sizeof(name), linkchar, 1, 0, } +#define ROENT(name, linkchar) { #name, (void *)&name, sizeof(name), linkchar, 0, 0, } +#define RXENT(name, linkchar) { #name, &name, sizeof(name), linkchar, 0, 1, } + RWENT(small_data, 'D'), + RWENT(big_data, 'D'), + RWENT(small_bss, 'B'), + RWENT(big_bss, 'B'), + ROENT(small_const, 'T'), + ROENT(big_const, 'T'), + RXENT(static_func, 'T'), + RXENT(global_func, 'T'), +}; + +#define NUM_TESTS (sizeof(testtab) / sizeof(testtab[0])) + +static int sharing; +static int elfmap_off; +static int shmid; +static ino_t *shm; + +static char link_string[32]; + +static void get_link_string(const char *argv0) +{ + const char *p, *q; + + /* Find program basename */ + p = strrchr(argv0, '/'); + if (p) + p++; + else + p = argv0; + + if (*p != 'x') + 
return; /* just a plain ordinary link */ + + q = strchr(p, '.'); + if (!q) + /* ERROR? */ + return; + + memcpy(link_string, p, q-p); +} + +static ino_t do_test(struct test_entry *te) +{ + int i; + volatile int *p = te->data; + + if (te->writable) { + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i] = CONST ^ i; + + barrier(); + + for (i = 0; i < (te->size / sizeof(*p)); i++) { + if (p[i] != (CONST ^ i)) { + verbose_printf("mismatch on %s", te->name); + exit(RC_FAIL); + } + } + } else if (te->execable) { + int (*pf)(int) = te->data; + + if ((*pf)(CONST) != CONST) { + verbose_printf("%s returns incorrect results", te->name); + exit(RC_FAIL); + } + } else { + /* Otherwise just read touch it */ + for (i = 0; i < (te->size / sizeof(*p)); i++) + p[i]; + } + + te->is_huge = (test_addr_huge(te->data) == 1); + + return get_addr_inode(te->data); +} + +static void parse_env(void) +{ + char *env; + + env = getenv("HUGETLB_ELFMAP"); + if (env && (strcasecmp(env, "no") == 0)) { + verbose_printf("Segment remapping disabled\n"); + elfmap_off = 1; + } else { + env = getenv("HUGETLB_SHARE"); + if (env) + sharing = atoi(env); + verbose_printf("Segment remapping enabled, " + "sharing = %d\n", sharing); + } +} + +static pid_t spawn_child(char *self, int index) +{ + int ret; + char execarg1[5]; + + ret = snprintf(execarg1, 5, "%d", index); + if (ret < 0) + FAIL("snprintf failed: %s", strerror(errno)); + + ret = fork(); + if (ret) { + if (ret < 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("fork failed: %s", + strerror(errno)); + } + } else { + ret = execlp(self, self, execarg1, NULL); + if (ret) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("execl(%s, %s, %s failed: %s", + self, self, execarg1, + strerror(errno)); + } + } + + return ret; +} + +static int child_process(char *self, int index) +{ + int i; + ino_t ino; + + get_link_string(self); + + shmid = shmget(SHM_KEY, NUM_CHILDREN * NUM_TESTS * + sizeof(ino_t), 0666); + if (shmid < 0) { + 
verbose_printf("Child's shmget failed: %s", strerror(errno)); + exit(RC_FAIL); + } + + shm = shmat(shmid, NULL, 0); + if (shm == (void *)-1) { + verbose_printf("Child's shmat failed: %s", strerror(errno)); + exit(RC_FAIL); + } + + for (i = 0; i < NUM_TESTS; i++) { + if (!test_addr_huge(testtab + i)) { + /* don't care about non-huge addresses */ + shm[index * NUM_TESTS + i] = 0; + } else { + ino = do_test(testtab + i); + if ((int)ino < 0) { + shmdt(shm); + exit(RC_FAIL); + } + shm[index * NUM_TESTS + i] = ino; + } + } + shmdt(shm); + return 0; +} + +static void verify_inodes() +{ + int i, j; + + for (i = 0; i < NUM_TESTS; i++) { + ino_t base = shm[i]; + for (j = 1; j < NUM_CHILDREN; j++) { + ino_t comp = shm[j * NUM_TESTS + i]; + if (base != comp) { + /* + * we care if we mismatch if + * sharing only read-only + * segments and this is one + */ + if (sharing == 1 && testtab[i].writable == 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("Inodes do not match " + "(%u != %u)", + (int)base, (int)comp); + } + } else { + /* + * we care if we match if + * a) not remapping or + * b) not sharing or + * c) sharing only read-only + * segments and this is not one + * BUT only if the inode is not + * 0 (don't care about the file) + */ + if (base == 0) + continue; + + if (elfmap_off == 1 || sharing == 0 || + (sharing == 1 && testtab[i].writable == 1)) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + if (sharing == 1 && testtab[i].writable == 1) + verbose_printf("Incorrectly sharing a writable segment...\n"); + FAIL("Inodes match, but we should not be " + "sharing this segment (%d == %d)", + (int)base, (int)comp); + } + } + } + } +} + +static void sigsegv_handler(int signum, siginfo_t *si, void *context) +{ + FAIL("Segmentation fault in parent at address %p", si->si_addr); +} + +int main(int argc, char *argv[], char *envp[]) +{ + test_init(argc, argv); + + if (argc == 1) { + /* + * first process + */ + pid_t children_pids[NUM_CHILDREN]; + int ret, i; + int status; + /* 
+ * We catch children's segfaults via waitpid's status, + * but this is to catch the parent itself segfaulting. + * This can happen, for instance, if an old (bad) + * segment file is left lying around in the hugetlbfs + * mountpoint + */ + struct sigaction sa_seg = { + .sa_sigaction = sigsegv_handler, + .sa_flags = SA_SIGINFO, + }; + + parse_env(); + + ret = sigaction(SIGSEGV, &sa_seg, NULL); + if (ret < 0) + FAIL("Installing SIGSEGV handler failed: %s", + strerror(errno)); + + shmid = shmget(SHM_KEY, NUM_CHILDREN * NUM_TESTS * + sizeof(ino_t), IPC_CREAT | IPC_EXCL | + 0666); + if (shmid < 0) + FAIL("Parent's shmget failed: %s", strerror(errno)); + + shm = shmat(shmid, NULL, 0); + if (shm == (void *)-1) + FAIL("Parent's shmat failed: %s", strerror(errno)); + + for (i = 0; i < NUM_CHILDREN; i++) + children_pids[i] = spawn_child(argv[0], i); + + for (i = 0; i < NUM_CHILDREN; i++) { + ret = waitpid(children_pids[i], &status, 0); + if (ret < 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("waitpid failed: %s", strerror(errno)); + } + + if (WIFEXITED(status) && WEXITSTATUS(status) != 0) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("Child %d exited with non-zero status: %d", + i + 1, WEXITSTATUS(status)); + } + + if (WIFSIGNALED(status)) { + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + FAIL("Child %d killed by signal: %s", i + 1, + strsignal(WTERMSIG(status))); + } + } + + verify_inodes(); + + shmctl(shmid, IPC_RMID, NULL); + shmdt(shm); + PASS(); + } else { + if (argc == 2) { + /* + * child process + * arg1 = index + 1 into shared memory array + */ + child_process(argv[0], atoi(argv[1])); + } else { + FAIL("Invalid arguments\n"); + } + } + + return 0; +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/madvise_reserve.c b/default/libhugetlbfs/libhugetlbfs/tests/madvise_reserve.c new file mode 100644 index 0000000..2f7bd67 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/madvise_reserve.c @@ -0,0 +1,81 @@ +/* + * libhugetlbfs - Easy 
use of Linux hugepages + * Copyright (C) 2005-2006 IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> +#include "hugetests.h" + +/* + * Test rationale: + * + * madvise() on some kernels can cause the reservation counter to get + * corrupted. The problem is that the patches are allocated for the + * reservation but not faulted in at the time of allocation. The + * counters do not get updated and effectively "leak". This test + * identifies whether the kernel is vunerable to the problem or not. 
+ * It is fixed in kernel by commit f2deae9d4e70793568ef9e85d227abb7bef5b622 + */ +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long initial_rsvd, map_rsvd, madvise_rsvd, end_rsvd; + + test_init(argc, argv); + + /* Setup */ + hpage_size = check_hugepagesize(); + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + initial_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count before map: %lu\n", initial_rsvd); + + /* mmap a region and record reservations */ + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + map_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after map: %lu\n", map_rsvd); + + /* madvise the region and record reservations */ + if (madvise(p, hpage_size, MADV_WILLNEED) == -1) + FAIL("madvise(): %s", strerror(errno)); + madvise_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after madvise: %lu\n", madvise_rsvd); + + /* Free region */ + munmap(p, hpage_size); + close(fd); + end_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after close(): %lu\n", end_rsvd); + + /* Reserve count should match initial reserve count */ + if (end_rsvd != initial_rsvd) + FAIL("Reserve leaked: %lu != %lu\n", end_rsvd, initial_rsvd); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/madvise_reserve.sh b/default/libhugetlbfs/libhugetlbfs/tests/madvise_reserve.sh new file mode 100755 index 0000000..cfe582d --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/madvise_reserve.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# madvise is known broken before 2.6.30 +compare_kvers `uname -r` "2.6.30" +if [ $? 
-eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC madvise_reserve "$@" +fi + diff --git a/default/libhugetlbfs/libhugetlbfs/tests/malloc.c b/default/libhugetlbfs/libhugetlbfs/tests/malloc.c new file mode 100644 index 0000000..a1af5e1 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/malloc.c @@ -0,0 +1,87 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include "hugetests.h" + +/* + * We cannot test mapping size against huge page size because we are not linked + * against libhugetlbfs so gethugepagesize() won't work. So instead we define + * our MIN_PAGE_SIZE as 64 kB (the largest base page available) and make sure + * the mapping page size is larger than this. 
+ */ +#define MIN_PAGE_SIZE 65536 + +static int block_sizes[] = { + sizeof(int), 1024, 128*1024, 1024*1024, 16*1024*1024, + 32*1024*1024, +}; +#define NUM_SIZES (sizeof(block_sizes) / sizeof(block_sizes[0])) + +int main(int argc, char *argv[]) +{ + int i; + char *env1, *env2, *exe; + int expect_hugepage = 0; + char *p; + + test_init(argc, argv); + exe = strrchr(test_name, '/'); + if (exe) + exe++; /* skip over "/" */ + else + exe = test_name; + + env1 = getenv("HUGETLB_MORECORE"); + verbose_printf("HUGETLB_MORECORE=%s\n", env1); + env2 = getenv("HUGETLB_RESTRICT_EXE"); + verbose_printf("HUGETLB_RESTRICT_EXE=%s\n", env2); + if (env1 && (!env2 || strstr(env2, exe))) + expect_hugepage = 1; + verbose_printf("expect_hugepage=%d\n", expect_hugepage); + + for (i = 0; i < NUM_SIZES; i++) { + int size = block_sizes[i]; + unsigned long long mapping_size; + + p = malloc(size); + if (! p) + FAIL("malloc()"); + + verbose_printf("malloc(%d) = %p\n", size, p); + + memset(p, 0, size); + + mapping_size = get_mapping_page_size(p); + + if (expect_hugepage && (mapping_size <= MIN_PAGE_SIZE)) + FAIL("Address is not hugepage"); + if (!expect_hugepage && (mapping_size > MIN_PAGE_SIZE)) + FAIL("Address is unexpectedly huge"); + + free(p); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/malloc_manysmall.c b/default/libhugetlbfs/libhugetlbfs/tests/malloc_manysmall.c new file mode 100644 index 0000000..25086a8 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/malloc_manysmall.c @@ -0,0 +1,76 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include "hugetests.h" + +/* + * We cannot test mapping size against huge page size because we are not linked + * against libhugetlbfs so gethugepagesize() won't work. So instead we define + * our MIN_PAGE_SIZE as 64 kB (the largest base page available) and make sure + * the mapping page size is larger than this. + */ +#define MIN_PAGE_SIZE 65536 + +#define ALLOC_SIZE (128) +#define NUM_ALLOCS (262144) + +int main(int argc, char *argv[]) +{ + int i; + char *env; + char *p; + int expect_hugepage = 0; + + test_init(argc, argv); + + env = getenv("HUGETLB_MORECORE"); + verbose_printf("HUGETLB_MORECORE=%s\n", env); + if (env) + expect_hugepage = 1; + + for (i = 0; i < NUM_ALLOCS; i++) { + p = malloc(ALLOC_SIZE); + if (! 
p) + FAIL("malloc()"); + + if (i < 16) + verbose_printf("p = %p\n", p); + + memset(p, 0, ALLOC_SIZE); + + if ((i % 157) == 0) { + /* With this many allocs, testing every one + * takes forever */ + unsigned long long mapping_size = + get_mapping_page_size(p); + if (expect_hugepage && (mapping_size <= MIN_PAGE_SIZE)) + FAIL("Address is not hugepage"); + if (!expect_hugepage && (mapping_size > MIN_PAGE_SIZE)) + FAIL("Address is unexpectedly huge"); + } + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/map_high_truncate_2.c b/default/libhugetlbfs/libhugetlbfs/tests/map_high_truncate_2.c new file mode 100644 index 0000000..daabd00 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/map_high_truncate_2.c @@ -0,0 +1,100 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _LARGEFILE64_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <signal.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * At one stage, a misconversion of hugetlb_vmtruncate_list to a + * prio_tree meant that on 32-bit machines, certain combinations of + * mapping and truncations could truncate incorrect pages, or + * overwrite pmds from other VMAs, triggering BUG_ON()s or other + * weirdness. + * + * Test adapted to the libhugetlbfs framework from an example by + * Kenneth Chen <kenneth.w.chen@xxxxxxxxx> + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. + * + * The kernel bug in question was fixed with commit + * 856fc29505556cf263f3dcda2533cf3766c14ab6. 
+ */ +#define MAP_LENGTH (4 * hpage_size) +#define TRUNCATE_POINT 0x60000000UL +#define HIGH_ADDR 0xa0000000UL + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + char *p, *q; + unsigned long i; + int err; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + check_free_huge_pages(4); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + /* First mapping */ + p = mmap(0, MAP_LENGTH + TRUNCATE_POINT, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_NORESERVE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + munmap(p, 4*hpage_size + TRUNCATE_POINT); + + q = mmap((void *)HIGH_ADDR, MAP_LENGTH, PROT_READ | PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (q == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + verbose_printf("High map at %p\n", q); + + for (i = 0; i < MAP_LENGTH; i += hpage_size) + q[i] = 1; + + err = ftruncate(fd, TRUNCATE_POINT); + if (err != 0) + FAIL("ftruncate(): %s", strerror(errno)); + + if (q[0] != 1) + FAIL("data mismatch"); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/meminfo_nohuge.c b/default/libhugetlbfs/libhugetlbfs/tests/meminfo_nohuge.c new file mode 100644 index 0000000..7cbc624 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/meminfo_nohuge.c @@ -0,0 +1,79 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <dlfcn.h> +#include <stdarg.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* We override the normal open, so libhugetlbfs gets a /proc/meminfo + * which doesn't contain any hugepage information */ +int open(const char *path, int flags, ...) +{ + int (*old_open)(const char *, int, ...); + int fd; + + if (strcmp(path, "/proc/meminfo") == 0) { + FILE *f; + + f = popen("/bin/grep -vi ^hugepage /proc/meminfo", "r"); + return fileno(f); + } + + if (strcmp(path, "/proc/mounts") == 0) { + FILE *f; + + f = popen("/bin/grep -vi hugetlbfs /proc/mounts", "r"); + return fileno(f); + } + + old_open = dlsym(RTLD_NEXT, "open"); + if (flags & O_CREAT) { + va_list ap; + + va_start(ap, flags); + fd = (*old_open)(path, flags, va_arg(ap, mode_t)); + va_end(ap); + return fd; + } else { + return (*old_open)(path, flags); + } +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + + test_init(argc, argv); + + hpage_size = gethugepagesize(); + if (hpage_size == -1) + PASS(); + + FAIL("Mysteriously found a hugepage size of %ld\n", hpage_size); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/misalign.c b/default/libhugetlbfs/libhugetlbfs/tests/misalign.c new file mode 100644 index 0000000..de85be6 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/misalign.c @@ -0,0 +1,121 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2007 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <signal.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * Just as normal mmap()s can't have an address, length or offset + * which is not page aligned, so hugepage mmap()s can't have an + * address, length or offset which is not hugepage aligned. + * + * However, from time to time when the various mmap() / + * get_unmapped_area() paths are updated, somebody misses one of the + * necessary checks for the hugepage paths. This testcase ensures + * that attempted hugepage mappings with parameters which are not + * correctly hugepage aligned are rejected. 
+ */ +int main(int argc, char *argv[]) +{ + long page_size, hpage_size; + int fd; + void *p, *q; + int err; + + test_init(argc, argv); + + page_size = getpagesize(); + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + /* First see what an ok mapping looks like, as a basis for our + * bad addresses and so forth */ + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() without hint failed: %s", strerror(errno)); + if (((unsigned long)p % hpage_size) != 0) + FAIL("mmap() without hint at misaligned address"); + + verbose_printf("Mapped at %p, length 0x%lx\n", p, hpage_size); + + err = munmap(p, hpage_size); + if (err != 0) + FAIL("munmap() without hint failed: %s", strerror(errno)); + + /* 1) Try a misaligned hint address */ + q = mmap(p + page_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, 0); + if (q == MAP_FAILED) + /* Bad hint shouldn't fail, just ignore the hint */ + FAIL("mmap() with hint failed: %s", strerror(errno)); + if (((unsigned long)q % hpage_size) != 0) + FAIL("mmap() with hint at misaligned address"); + + err = munmap(q, hpage_size); + if (err != 0) + FAIL("munmap() with hint failed: %s", strerror(errno)); + + /* 2) Try a misaligned address with MAP_FIXED */ + q = mmap(p + page_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_FIXED, fd, 0); + if (q != MAP_FAILED) + FAIL("mmap() MAP_FIXED at misaligned address succeeded"); + + /* 3) Try a misaligned length */ + q = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (q != MAP_FAILED) + FAIL("mmap() with misaligned length 0x%lx succeeded", + page_size); + + /* 4) Try a misaligned length with MAP_FIXED */ + q = mmap(p, page_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_FIXED, fd, 0); + if (q != MAP_FAILED) + FAIL("mmap() MAP_FIXED with misaligned length 0x%lx succeeded", + page_size); + + /* 5) Try a misaligned offset */ + q = mmap(NULL, 
hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, page_size); + if (q != MAP_FAILED) + FAIL("mmap() with misaligned offset 0x%lx succeeded", + page_size); + + /* 6) Try a misaligned offset with MAP_FIXED*/ + q = mmap(p, hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE|MAP_FIXED, fd, page_size); + if (q != MAP_FAILED) + FAIL("mmap() MAP_FIXED with misaligned offset 0x%lx succeeded", + page_size); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/misaligned_offset.c b/default/libhugetlbfs/libhugetlbfs/tests/misaligned_offset.c new file mode 100644 index 0000000..e82ffe1 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/misaligned_offset.c @@ -0,0 +1,140 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * Copyright (C) 2006 Hugh Dickins <hugh@xxxxxxxxxxx> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <signal.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * At one stage, a misconversion of hugetlb_vmtruncate_list to a + * prio_tree meant that on 32-bit machines, truncates at or above 4GB + * could truncate lower pages, resulting in BUG_ON()s. + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. + * + * The kernel bug in question was fixed with commit + * 856fc29505556cf263f3dcda2533cf3766c14ab6. + */ + +#define RANDOM_CONSTANT 0x1234ABCD + +int main(int argc, char *argv[]) +{ + int page_size; + long hpage_size; + off_t buggy_offset; + int fd; + void *p, *q; + volatile int *pi; + int err; + + test_init(argc, argv); + + page_size = getpagesize(); + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + /* First, we make a 2 page sane hugepage mapping. Then we + * memset() it to ensure that the ptes are instantiated for + * it. Then we attempt to replace the second half of the map + * with one at a bogus offset. We leave the first page of + * sane mapping in place to ensure that the corresponding + * pud/pmd/whatever entries aren't cleaned away. It's those + * bad entries which can trigger bad_pud() checks if the + * backout path for the bogus mapping is buggy, which it was + * in some kernels. 
*/ + + verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + verbose_printf("Mapping reference map..."); + /* First get arena of three hpages size, at file offset 4GB */ + p = mmap(NULL, 2*hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() offset 4GB: %s", strerror(errno)); + verbose_printf("%p-%p\n", p, p+2*hpage_size-1); + + verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + /* Instantiate the pages */ + verbose_printf("Instantiating..."); + memset(p, 0, 2*hpage_size); + pi = p; + *pi = RANDOM_CONSTANT; + verbose_printf("done.\n"); + + verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + /* Toggle the permissions on the first page. This forces TLB + * entries (including hash page table on powerpc) to be + * flushed, so that the page tables must be accessed for the + * test further down. In the buggy case, those page tables + * can get thrown away by a pud_clear() */ + err = mprotect(p, hpage_size, PROT_READ); + if (err) + FAIL("mprotect(%p, 0x%lx, PROT_READ): %s", p, hpage_size, + strerror(errno)); + + /* Replace top hpage by hpage mapping at confusing file offset */ + buggy_offset = page_size; + verbose_printf("Replacing map at %p with map from offset 0x%lx...", + p + hpage_size, (unsigned long)buggy_offset); + q = mmap(p + hpage_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE, fd, buggy_offset); + if (q != MAP_FAILED) + FAIL("bogus offset mmap() succeeded at %p: %s", q, strerror(errno)); + if (errno != EINVAL) + FAIL("bogus mmap() failed with \"%s\" instead of \"%s\"", + strerror(errno), strerror(EINVAL)); + verbose_printf("%s\n", strerror(errno)); + + verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + if (*pi != RANDOM_CONSTANT) + FAIL("Pre-existing mapping clobbered: %x instead of %x", + *pi, RANDOM_CONSTANT); + + 
verbose_printf("Free hugepages: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_FREE)); + + /* The real test is whether we got a bad_pud() or similar + * during the run. The check above, combined with the earlier + * mprotect()s to flush the TLB are supposed to catch it, but + * it's hard to be certain. Once bad_pud() is called + * behaviour can be very strange. */ + PASS_INCONCLUSIVE(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mlock.c b/default/libhugetlbfs/libhugetlbfs/tests/mlock.c new file mode 100644 index 0000000..88859f3 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mlock.c @@ -0,0 +1,72 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <sys/mman.h> +#include <sys/resource.h> + +#include <hugetlbfs.h> +#include "hugetests.h" + +static void test_simple_mlock(int flags) +{ + int fd = hugetlbfs_unlinked_fd(); + void *p; + int ret; + long hpage_size = check_hugepagesize(); + + p = mmap(0, hpage_size, PROT_READ|PROT_WRITE, flags, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() failed (flags=%x): %s", flags, strerror(errno)); + + ret = mlock(p, hpage_size); + if (ret) + FAIL("mlock() failed (flags=%x): %s", flags, strerror(errno)); + + ret = munlock(p, hpage_size); + if (ret) + FAIL("munlock() failed (flags=%x): %s", flags, strerror(errno)); + + ret = munmap(p, hpage_size); + if (ret) + FAIL("munmap() failed (flags=%x): %s", flags, strerror(errno)); + + close(fd); +} + +int main(int argc, char *argv[]) +{ + struct rlimit limit_info; + if(getrlimit(RLIMIT_MEMLOCK, &limit_info)) + ERROR("Unable to read locked memory rlimit: %s", strerror(errno)); + if(limit_info.rlim_cur < check_hugepagesize()) + CONFIG("Locked memory ulimit set below huge page size"); + + test_simple_mlock(MAP_PRIVATE); + test_simple_mlock(MAP_SHARED); + test_simple_mlock(MAP_PRIVATE|MAP_LOCKED); + test_simple_mlock(MAP_SHARED|MAP_LOCKED); + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mmap-cow.c b/default/libhugetlbfs/libhugetlbfs/tests/mmap-cow.c new file mode 100644 index 0000000..a7d3a86 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mmap-cow.c @@ -0,0 +1,182 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE + +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/wait.h> +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +extern int errno; + +#define P "mmap-cow" +#define DESC \ + "* Tests copy-on-write semantics of large pages where a number *\n"\ + "* of threads map the same file with the MAP_PRIVATE flag. The *\n"\ + "* threads then write into their copy of the mapping and recheck *\n"\ + "* the contents to ensure they were not corrupted by the other *\n"\ + "* threads. *"\ + +#define HTLB_FILE "mmap-cow" +#define BUF_SZ 256 + +#define CHILD_FAIL(thread, fmt, ...) 
\ + do { \ + verbose_printf("Thread %d (pid=%d) FAIL: " fmt, \ + thread, getpid(), __VA_ARGS__); \ + exit(1); \ + } while (0) + +/* Setup Configuration */ +static int nr_hugepages; /* Number of huge pages to allocate */ +static unsigned int threads; /* Number of threads to run */ + +static int mmap_file(int fd, char **addr, size_t size, int type) +{ + int flags = 0; + + *addr = mmap(NULL, size, PROT_READ|PROT_WRITE, flags | type, fd, 0); + if (*addr == MAP_FAILED) + return -1; + + return 0; +} + +static void do_work(int thread, size_t size, int fd) +{ + char *addr; + size_t i; + char pattern = thread+65; + + if (mmap_file(fd, &addr, size, MAP_PRIVATE)) + CHILD_FAIL(thread, "mmap() failed: %s", strerror(errno)); + + verbose_printf("Thread %d (pid=%d): Mapped at address %p\n", + thread, getpid(), addr); + + /* Write to the mapping with a distinct pattern */ + verbose_printf("Thread %d (pid=%d): Writing %c to the mapping\n", + thread, getpid(), pattern); + for (i = 0; i < size; i++) + memcpy((char *)addr+i, &pattern, 1); + + if (msync(addr, size, MS_SYNC)) + CHILD_FAIL(thread, "msync() failed: %s", strerror(errno)); + + /* Verify the pattern */ + for (i = 0; i < size; i++) + if (addr[i] != pattern) + CHILD_FAIL(thread, "Corruption at %p; " + "Got %c, Expected %c", + &addr[i], addr[i], pattern); + + verbose_printf("Thread %d (pid=%d): Pattern verified\n", + thread, getpid()); + + /* Munmap the area */ + munmap(addr, size); + close(fd); + exit(0); +} + +int main(int argc, char ** argv) +{ + char *addr; + long hpage_size; + size_t size; + int i, pid, status, fd, ret; + pid_t *wait_list; + + test_init(argc, argv); + + if (argc < 3) + CONFIG("Usage: mmap-cow <# threads> <# pages>\n"); + + nr_hugepages = atoi(argv[2]); + threads = atoi(argv[1]); + + if ((threads+1) > nr_hugepages) + CONFIG("Need more hugepages than threads\n"); + + wait_list = malloc(threads * sizeof(pid_t)); + if (wait_list == NULL) + CONFIG("Couldn't allocate memory for wait_list\n"); + + hpage_size = 
check_hugepagesize(); + /* Have to have enough available hugepages for each thread to + * get its own copy, plus one for the parent/page-cache */ + size = (nr_hugepages / (threads+1)) * hpage_size; + verbose_printf("hpage_size is %lx, Size is %zu, threads: %u\n", + hpage_size, size, threads); + + /* First, open the file */ + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + CONFIG("hugetlbfs_unlinked_fd() failed: %s\n", + strerror(errno)); + + /* First, mmap the file with MAP_SHARED and fill with data + * If this is not done, then the fault handler will not be + * called in the kernel since private mappings will be + * created for the children at prefault time. + */ + if (mmap_file(fd, &addr, size, MAP_SHARED)) + FAIL("Failed to create shared mapping: %s", strerror(errno)); + + for (i = 0; i < size; i += 8) { + memcpy(addr+i, "deadbeef", 8); + } + + for (i=0; i<threads; i++) { + if ((pid = fork()) < 0) + FAIL("fork: %s", strerror(errno)); + + if (pid == 0) + do_work(i, size, fd); + + wait_list[i] = pid; + } + for (i=0; i<threads; i++) { + ret = waitpid(wait_list[i], &status, 0); + if (ret < 0) + FAIL("waitpid(): %s", strerror(errno)); + if (WEXITSTATUS(status) != 0) + FAIL("Thread %d (pid=%d) failed", i, wait_list[i]); + + if (WIFSIGNALED(status)) + FAIL("Thread %d (pid=%d) received unhandled signal", i, + wait_list[i]); + } + + munmap(addr, size); + close(fd); + free(wait_list); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mmap-gettest.c b/default/libhugetlbfs/libhugetlbfs/tests/mmap-gettest.c new file mode 100644 index 0000000..5f607f7 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mmap-gettest.c @@ -0,0 +1,127 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +#define P "mmap-gettest" +#define DESC \ + "* This baseline test validates that a mapping of a certain size *\n"\ + "* can be created, correctly. Once created, all the pages are *\n"\ + "* filled with a pattern and rechecked to test for corruption. *\n"\ + "* The mapping is then released. This process is repeated for a *\n"\ + "* specified number of iterations. 
*" + +extern int errno; + +#define BUF_SZ 256 + +/* Global test configuration */ +#define HTLB_FILE "mmap-gettest" +static char hugetlb_mount[BUF_SZ]; +static unsigned int iter; +static int nr_hugepages; +static long hpage_size; + +static int do_one(char *mountpoint, size_t size) { + char *ma; + int fha; + size_t i,j; + char pattern = 'A'; + + fha = hugetlbfs_unlinked_fd(); + if (fha < 0) + CONFIG("Unable to open temp file in hugetlbfs (%s)", + strerror(errno)); + + /* Map the files with MAP_PRIVATE */ + ma = mmap(NULL, size, (PROT_READ|PROT_WRITE), MAP_SHARED, fha, 0); + if (ma == MAP_FAILED) + FAIL("Failed to mmap the hugetlb file: %s", strerror(errno)); + + /* Make sure the page is zeroed */ + for (i = 0; i < nr_hugepages; i++) { + verbose_printf("Verifying %p\n", (ma+(i*hpage_size))); + for (j = 0; j < hpage_size; j++) { + if (*(ma+(i*hpage_size)+j) != 0) + FAIL("Verifying the mmap area failed. " + "Got %c, expected 0", + *(ma+(i*hpage_size)+j)); + } + } + /* Fill each file with a pattern */ + for (i = 0; i < nr_hugepages; i++) { + pattern = 65+(i%26); + verbose_printf("Touching %p with %c\n", ma+(i*hpage_size),pattern); + memset(ma+(i*hpage_size), pattern, hpage_size); + } + + /* Verify the pattern */ + for (i = 0; i < nr_hugepages; i++) { + pattern = 65+(i%26); + verbose_printf("Verifying %p\n", (ma+(i*hpage_size))); + for (j = 0; j < hpage_size; j++) { + if (*(ma+(i*hpage_size)+j) != pattern) + FAIL("Verifying the mmap area failed. 
" + "Got %c, expected %c", + *(ma+(i*hpage_size)+j),pattern); + } + } + + /* Munmap the area */ + munmap(ma, size); + + /* Close and delete the file */ + close(fha); + return 0; +} + +int main(int argc, char ** argv) +{ + size_t size; + int i; + + test_init(argc, argv); + + if (argc < 3) + CONFIG("Usage: %s <# iterations> <# pages>\n", argv[0]); + + iter = atoi(argv[1]); + nr_hugepages = atoi(argv[2]); + + hpage_size = check_hugepagesize(); + size = nr_hugepages * hpage_size; + + for (i=0; i < iter; i++) { + verbose_printf("Iteration %d\n", i); + do_one(hugetlb_mount, size); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mprotect.c b/default/libhugetlbfs/libhugetlbfs/tests/mprotect.c new file mode 100644 index 0000000..aa4673e --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mprotect.c @@ -0,0 +1,217 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> +#include <setjmp.h> +#include <signal.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +static sigjmp_buf sig_escape; +static void *sig_expected = MAP_FAILED; +static long hpage_size; + +static void sig_handler(int signum, siginfo_t *si, void *uc) +{ + if (signum == SIGSEGV) { + verbose_printf("SIGSEGV at %p (sig_expected=%p)\n", si->si_addr, + sig_expected); + if (si->si_addr == sig_expected) { + siglongjmp(sig_escape, 1); + } + FAIL("SIGSEGV somewhere unexpected"); + } + FAIL("Unexpected signal %s", strsignal(signum)); +} + +static int test_read(void *p) +{ + volatile unsigned long *pl = p; + unsigned long x; + + if (sigsetjmp(sig_escape, 1)) { + /* We got a SEGV */ + sig_expected = MAP_FAILED; + return -1; + } + + sig_expected = p; + barrier(); + x = *pl; + barrier(); + sig_expected = MAP_FAILED; + + return 0; +} + +static int test_write(void *p, unsigned long val) +{ + volatile unsigned long *pl = p; + unsigned long x; + + if (sigsetjmp(sig_escape, 1)) { + /* We got a SEGV */ + sig_expected = MAP_FAILED; + return -1; + } + + sig_expected = p; + barrier(); + *pl = val; + x = *pl; + barrier(); + sig_expected = MAP_FAILED; + + return (x != val); +} + +#define RANDOM_CONSTANT 0x1234ABCD + +static void test_prot(void *p, int prot) +{ + int r, w; + + verbose_printf("Reading.."); + r = test_read(p); + verbose_printf("%d\n", r); + verbose_printf("Writing.."); + w = test_write(p, RANDOM_CONSTANT); + verbose_printf("%d\n", w); + + if (prot & PROT_READ) { + if (r != 0) + FAIL("read failed on mmap(prot=%x)", prot); + } else { + if (r != -1) + FAIL("read succeeded on mmap(prot=%x)", prot); + } 
+ + if (prot & PROT_WRITE) { + switch (w) { + case -1: + FAIL("write failed on mmap(prot=%x)", prot); + break; + case 0: + break; + case 1: + FAIL("write mismatch on mmap(prot=%x)", prot); + break; + default: + TEST_BUG(); + } + } else { + switch (w) { + case -1: + break; + case 0: + FAIL("write succeeded on mmap(prot=%x)", prot); + break; + case 1: + FAIL("write mismatch on mmap(prot=%x)", prot); + break; + default: + TEST_BUG(); + } + } +} + +static void test_mprotect(int fd, char *testname, + unsigned long len1, int prot1, + unsigned long len2, int prot2) +{ + void *p; + int err; + + verbose_printf("Testing %s\n", testname); + verbose_printf("Mapping with prot=%x\n", prot1); + p = mmap(NULL, len1, prot1, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("%s: mmap(prot=%x): %s", testname, prot1, + strerror(errno)); + + test_prot(p, prot1); + + verbose_printf("mprotect()ing to prot=%x\n", prot2); + err = mprotect(p, len2, prot2); + if (err != 0) + FAIL("%s: mprotect(prot=%x): %s", testname, prot2, + strerror(errno)); + + test_prot(p, prot2); + + if (len2 < len1) + test_prot(p + len2, prot1); + + munmap(p, len1); +} + +int main(int argc, char *argv[]) +{ + int err; + int fd; + void *p; + + test_init(argc, argv); + + struct sigaction sa = { + .sa_sigaction = sig_handler, + .sa_flags = SA_SIGINFO, + }; + + err = sigaction(SIGSEGV, &sa, NULL); + if (err) + FAIL("Can't install SIGSEGV handler: %s", strerror(errno)); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + verbose_printf("instantiating page\n"); + + p = mmap(NULL, 2*hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + memset(p, 0, hpage_size); + munmap(p, hpage_size); + + /* Basic protection change tests */ + test_mprotect(fd, "R->RW", hpage_size, PROT_READ, + hpage_size, PROT_READ|PROT_WRITE); + test_mprotect(fd, "RW->R", hpage_size, PROT_READ|PROT_WRITE, + hpage_size, 
PROT_READ); + + /* Tests which require VMA splitting */ + test_mprotect(fd, "R->RW 1/2", 2*hpage_size, PROT_READ, + hpage_size, PROT_READ|PROT_WRITE); + test_mprotect(fd, "RW->R 1/2", 2*hpage_size, PROT_READ|PROT_WRITE, + hpage_size, PROT_READ); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mremap-expand-slice-collision.c b/default/libhugetlbfs/libhugetlbfs/tests/mremap-expand-slice-collision.c new file mode 100644 index 0000000..0cbff15 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mremap-expand-slice-collision.c @@ -0,0 +1,188 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2009 David Gibson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +#ifdef __LP64__ +#define SLICE_BOUNDARY 0x20000000000 +#else +#define SLICE_BOUNDARY 0xe0000000 +#endif + +long hpage_size, page_size; + +void do_readback(void *p, size_t size, const char *stage) +{ + unsigned int *q = p; + int i; + + verbose_printf("do_readback(%p, 0x%lx, \"%s\")\n", p, + (unsigned long)size, stage); + + for (i = 0; i < (size / sizeof(*q)); i++) { + q[i] = RANDOM_CONSTANT ^ i; + } + + for (i = 0; i < (size / sizeof(*q)); i++) { + if (q[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Stage \"%s\": Mismatch at offset 0x%x: 0x%x instead of 0x%x", + stage, i, q[i], RANDOM_CONSTANT ^ i); + } +} + +void do_remap(int fd, void *target) +{ + void *a, *b; + int rc; + + a = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (a == MAP_FAILED) + FAIL("mmap(huge page): %s", strerror(errno)); + + verbose_printf("Huge base mapping at %p\n", a); + + do_readback(a, hpage_size, "base huge"); + + verbose_printf("Attempting mremap(MAYMOVE|FIXED) to %p...", target); + + b = mremap(a, hpage_size, hpage_size, MREMAP_MAYMOVE | MREMAP_FIXED, + target); + + if (b != MAP_FAILED) { + verbose_printf("testing..."); + do_readback(b, hpage_size, "remapped"); + verbose_printf("ok\n"); + } else { + verbose_printf("disallowed (%s)\n", strerror(errno)); + } + + rc = munmap(b, hpage_size); + if (rc != 0) + FAIL("munmap(after remap): %s", strerror(errno)); +} + +int main(int argc, char *argv[]) +{ + int fd, rc; + void *p, *q, *r; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + page_size = getpagesize(); + + + fd = 
hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + /* First, hugepages above, normal below */ + p = mmap((void *)(SLICE_BOUNDARY + hpage_size), hpage_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(huge above): %s", strerror(errno)); + + do_readback(p, hpage_size, "huge above"); + + q = mmap((void *)(SLICE_BOUNDARY - page_size), page_size, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS | MAP_FIXED, -1, 0); + if (q == MAP_FAILED) + FAIL("mmap(normal below): %s", strerror(errno)); + + do_readback(q, page_size, "normal below"); + + verbose_printf("Attempting to remap..."); + + r = mremap(q, page_size, 2*page_size, 0); + if (r == MAP_FAILED) { + verbose_printf("disallowed\n"); + rc = munmap(q, page_size); + if (rc != 0) + FAIL("munmap(normal below): %s", strerror(errno)); + } else { + if (r != q) + FAIL("mremap() moved without MREMAP_MAYMOVE!?"); + + verbose_printf("testing..."); + do_readback(q, 2*page_size, "normal below expanded"); + rc = munmap(q, 2*page_size); + if (rc != 0) + FAIL("munmap(normal below expanded): %s", strerror(errno)); + } + + rc = munmap(p, hpage_size); + if (rc != 0) + FAIL("munmap(huge above)"); + + /* Next, normal pages above, huge below */ + p = mmap((void *)(SLICE_BOUNDARY + hpage_size), page_size, + PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_FIXED | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) + FAIL("mmap(normal above): %s", strerror(errno)); + + do_readback(p, page_size, "normal above"); + + q = mmap((void *)(SLICE_BOUNDARY - hpage_size), + hpage_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, fd, 0); + if (q == MAP_FAILED) + FAIL("mmap(huge below): %s", strerror(errno)); + + do_readback(q, hpage_size, "huge below"); + + verbose_printf("Attempting to remap..."); + + r = mremap(q, hpage_size, 2*hpage_size, 0); + if (r == MAP_FAILED) { + verbose_printf("disallowed\n"); + rc = munmap(q, hpage_size); + if (rc != 0) + FAIL("munmap(huge below): %s", 
strerror(errno)); + } else { + if (r != q) + FAIL("mremap() moved without MREMAP_MAYMOVE!?"); + + verbose_printf("testing..."); + do_readback(q, 2*hpage_size, "huge below expanded"); + rc = munmap(q, 2*hpage_size); + if (rc != 0) + FAIL("munmap(huge below expanded): %s", strerror(errno)); + } + + rc = munmap(p, page_size); + if (rc != 0) + FAIL("munmap(normal above)"); + + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mremap-expand-slice-collision.sh b/default/libhugetlbfs/libhugetlbfs/tests/mremap-expand-slice-collision.sh new file mode 100755 index 0000000..8c9d98a --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mremap-expand-slice-collision.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# mremap-expand-slice-collision is known broken before 2.6.33 +compare_kvers `uname -r` "2.6.33" +if [ $? -eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC mremap-expand-slice-collision "$@" +fi + diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-huge-near-normal.c b/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-huge-near-normal.c new file mode 100644 index 0000000..2c41813 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-huge-near-normal.c @@ -0,0 +1,145 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2009 David Gibson, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +long hpage_size; + +void do_readback(void *p, size_t size, const char *stage) +{ + unsigned int *q = p; + int i; + + verbose_printf("do_readback(%p, 0x%lx, \"%s\")\n", p, + (unsigned long)size, stage); + + for (i = 0; i < (size / sizeof(*q)); i++) { + q[i] = RANDOM_CONSTANT ^ i; + } + + for (i = 0; i < (size / sizeof(*q)); i++) { + if (q[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Stage \"%s\": Mismatch at offset 0x%x: 0x%x instead of 0x%x", + stage, i, q[i], RANDOM_CONSTANT ^ i); + } +} + +void do_remap(int fd, void *target) +{ + void *a, *b; + int rc; + + a = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (a == MAP_FAILED) + FAIL("mmap(huge page): %s", strerror(errno)); + + verbose_printf("Huge base mapping at %p\n", a); + + do_readback(a, hpage_size, "base huge"); + + verbose_printf("Attempting mremap(MAYMOVE|FIXED) to %p...", target); + + b = mremap(a, hpage_size, hpage_size, MREMAP_MAYMOVE | MREMAP_FIXED, + target); + + if (b != MAP_FAILED) { + verbose_printf("testing..."); + do_readback(b, hpage_size, "remapped"); + verbose_printf("ok\n"); + + } else { + verbose_printf("disallowed (%s)\n", strerror(errno)); + b = a; + } + + rc = munmap(b, hpage_size); + if (rc != 0) + FAIL("munmap(after remap): %s", strerror(errno)); +} + +void *map_align(size_t size, size_t align) +{ + unsigned long xsize = size + align - getpagesize(); + void *p, *q; + int rc; + + p = mmap(NULL, xsize, PROT_READ|PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + q = PALIGN(p, align); + + 
rc = munmap(p, q-p); + if (rc != 0) + FAIL("munmap(lower aligning): %s", strerror(errno)); + + rc = munmap(q + size, p + xsize - (q + size)); + if (rc != 0) + FAIL("munmap(upper aligning): %s", strerror(errno)); + + + return q; +} + +int main(int argc, char *argv[]) +{ + int fd, rc; + void *p; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = map_align(3*hpage_size, hpage_size); + + rc = munmap(p, hpage_size); + if (rc != 0) + FAIL("munmap() low portion: %s", strerror(errno)); + + rc = munmap(p + 2*hpage_size, hpage_size); + if (rc != 0) + FAIL("munmap() high portion: %s", strerror(errno)); + + p = p + hpage_size; + + verbose_printf("Normal mapping at %p\n", p); + + do_readback(p, hpage_size, "base normal page"); + + do_remap(fd, p - hpage_size); + do_remap(fd, p + hpage_size); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-huge-near-normal.sh b/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-huge-near-normal.sh new file mode 100755 index 0000000..4b89c35 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-huge-near-normal.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# mremap-fixed-huge-near-normal is known broken before 2.6.33 +compare_kvers `uname -r` "2.6.33" +if [ $? -eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC mremap-fixed-huge-near-normal "$@" +fi + diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-normal-near-huge.c b/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-normal-near-huge.c new file mode 100644 index 0000000..1be5f8f --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-normal-near-huge.c @@ -0,0 +1,124 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2009 David Gibson, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +long page_size, hpage_size; + +void do_readback(void *p, size_t size, const char *stage) +{ + unsigned int *q = p; + int i; + + verbose_printf("do_readback(%p, 0x%lx, \"%s\")\n", p, + (unsigned long)size, stage); + + for (i = 0; i < (size / sizeof(*q)); i++) { + q[i] = RANDOM_CONSTANT ^ i; + } + + for (i = 0; i < (size / sizeof(*q)); i++) { + if (q[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Stage \"%s\": Mismatch at offset 0x%x: 0x%x instead of 0x%x", + stage, i, q[i], RANDOM_CONSTANT ^ i); + } +} + +void do_remap(void *target) +{ + void *a, *b; + int rc; + + a = mmap(NULL, page_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_ANONYMOUS, -1, 0); + if (a == MAP_FAILED) + FAIL("mmap(normal page): %s", strerror(errno)); + + verbose_printf("Normal base mapping at %p\n", a); + + do_readback(a, page_size, "base normal"); + + verbose_printf("Attempting mremap(MAYMOVE|FIXED) to %p...", target); + + b = mremap(a, page_size, page_size, MREMAP_MAYMOVE | MREMAP_FIXED, + target); + + if (b != MAP_FAILED) { + verbose_printf("testing..."); + 
do_readback(b, page_size, "remapped"); + verbose_printf("ok\n"); + } else { + verbose_printf("disallowed (%s)\n", strerror(errno)); + b = a; + } + + rc = munmap(b, page_size); + if (rc != 0) + FAIL("munmap(after remap): %s", strerror(errno)); +} + +int main(int argc, char *argv[]) +{ + int fd, rc; + void *p; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + page_size = getpagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, 3*hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + rc = munmap(p, hpage_size); + if (rc != 0) + FAIL("munmap() low hpage: %s", strerror(errno)); + + rc = munmap(p + 2*hpage_size, hpage_size); + if (rc != 0) + FAIL("munmap() high hpage: %s", strerror(errno)); + + p = p + hpage_size; + + verbose_printf("Hugepage mapping at %p\n", p); + + do_readback(p, hpage_size, "base hugepage"); + + do_remap(p - page_size); + do_remap(p + hpage_size); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-normal-near-huge.sh b/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-normal-near-huge.sh new file mode 100755 index 0000000..9ed058f --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/mremap-fixed-normal-near-huge.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# mremap-fixed-normal-near-huge is known broken before 2.6.33 +compare_kvers `uname -r` "2.6.33" +if [ $? 
-eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC mremap-fixed-normal-near-huge "$@" +fi + diff --git a/default/libhugetlbfs/libhugetlbfs/tests/private.c b/default/libhugetlbfs/libhugetlbfs/tests/private.c new file mode 100644 index 0000000..8f5cb45 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/private.c @@ -0,0 +1,92 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD +#define OTHER_CONSTANT 0xFEDC9876 + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p, *q; + unsigned int *pl, *ql; + int i; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() SHARED: %s", strerror(errno)); + + pl = p; + for (i = 0; i < (hpage_size / sizeof(*pl)); i++) { + pl[i] = RANDOM_CONSTANT ^ i; + } + + q = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_PRIVATE, + fd, 0); + if (q == MAP_FAILED) + FAIL("mmap() PRIVATE: %s", strerror(errno)); + + ql = q; + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + if (ql[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Mismatch"); + } + + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + ql[i] = OTHER_CONSTANT ^ i; + } + + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + if (ql[i] != (OTHER_CONSTANT ^ i)) + FAIL("PRIVATE mismatch"); + } + + for (i = 0; i < (hpage_size / sizeof(*pl)); i++) { + if (pl[i] != (RANDOM_CONSTANT ^ i)) + FAIL("SHARED map contaminated"); + } + + memset(p, 0, hpage_size); + + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + if (ql[i] != (OTHER_CONSTANT ^ i)) + FAIL("PRIVATE map contaminated"); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/ptrace-write-hugepage.c b/default/libhugetlbfs/libhugetlbfs/tests/ptrace-write-hugepage.c new file mode 100644 index 0000000..1896c4c --- /dev/null +++ 
b/default/libhugetlbfs/libhugetlbfs/tests/ptrace-write-hugepage.c @@ -0,0 +1,162 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/ptrace.h> +#include <sys/types.h> +#include <sys/wait.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define CONST 0xdeadbeefL + +static long hpage_size; +static volatile int ready_to_trace = 0; + +static void sigchld_handler(int signum, siginfo_t *si, void *uc) +{ + int status; + pid_t pid; + + pid = wait(&status); + if (WIFEXITED(status)) + exit(WEXITSTATUS(status)); + else if (WIFSIGNALED(status)) + exit(status); + + ready_to_trace = 1; +} + +static void child(int hugefd, int pipefd) +{ + void *p; + int err; + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + hugefd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + memset(p, 0, hpage_size); + + verbose_printf("Child mapped data at %p\n", p); + + err = write(pipefd, &p, sizeof(p)); + if (err == -1) + FAIL("Writing to pipe: %s", strerror(errno)); + if (err != sizeof(p)) + FAIL("Short 
write to pipe"); + + pause(); +} + +static void do_poke(pid_t pid, void *p) +{ + long err; + + verbose_printf("Poking..."); + err = ptrace(PTRACE_POKEDATA, pid, p, (void *)CONST); + if (err) + FAIL("ptrace(POKEDATA): %s", strerror(errno)); + verbose_printf("done\n"); + + verbose_printf("Peeking..."); + err = ptrace(PTRACE_PEEKDATA, pid, p, NULL); + if (err == -1) + FAIL("ptrace(PEEKDATA): %s", strerror(errno)); + + if (err != CONST) + FAIL("mismatch (%lx instead of %lx)", err, CONST); + verbose_printf("done\n"); +} + +int main(int argc, char *argv[]) +{ + int fd; + int pipefd[2]; + long err; + pid_t cpid; + void *p; + struct sigaction sa = { + .sa_sigaction = sigchld_handler, + .sa_flags = SA_SIGINFO, + }; + struct sigaction old_sa; + + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + err = sigaction(SIGCHLD, &sa, &old_sa); + if (err) + FAIL("Can't install SIGCHLD handler: %s", strerror(errno)); + + err = pipe(pipefd); + if (err) + FAIL("pipe(): %s", strerror(errno)); + + cpid = fork(); + if (cpid < 0) + FAIL("fork(): %s", strerror(errno)); + + + if (cpid == 0) { + child(fd, pipefd[1]); + exit(0); + } + + /* Parent */ + err = read(pipefd[0], &p, sizeof(p)); + if (err == -1) + FAIL("Reading pipe: %s\n", strerror(errno)); + if (err != sizeof(p)) + FAIL("Short read over pipe"); + + verbose_printf("Parent received address %p\n", p); + + err = ptrace(PTRACE_ATTACH, cpid, NULL, NULL); + if (err) + FAIL("ptrace(ATTACH): %s", strerror(errno)); + + while (! 
ready_to_trace) + ; + + do_poke(cpid, p); + do_poke(cpid, p + getpagesize()); + + err = sigaction(SIGCHLD, &old_sa, NULL); + if (err) + FAIL("Clearing SIGCHLD handler: %s", strerror(errno)); + + ptrace(PTRACE_KILL, cpid, NULL, NULL); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/quota.c b/default/libhugetlbfs/libhugetlbfs/tests/quota.c new file mode 100644 index 0000000..4961371 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/quota.c @@ -0,0 +1,271 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2007 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <string.h> +#include <unistd.h> +#include <sys/stat.h> +#include <sys/mount.h> +#include <fcntl.h> +#include <hugetlbfs.h> +#include <sys/vfs.h> +#include "hugetests.h" + +/* + * Test Rationale: + * + * The number of global huge pages available to a mounted hugetlbfs filesystem + * can be limited using a fs quota mechanism by setting the size attribute at + * mount time. 
Older kernels did not properly handle quota accounting in a + * number of cases (eg. for MAP_PRIVATE pages, and wrt MAP_SHARED reservation. + * + * This test replays some scenarios on a privately mounted filesystem to check + * for regressions in hugetlbfs quota accounting. + */ + +extern int errno; + +#define BUF_SZ 1024 + +/* Global test configuration */ +static long hpage_size; +char *mountpoint = NULL; + +/* map action flags */ +#define ACTION_COW 0x0001 +#define ACTION_TOUCH 0x0002 + +/* Testlet results */ +#define GOOD 0 +#define BAD_SIG 1 +#define BAD_EXIT 2 + +char result_str[3][10] = { "pass", "killed", "fail" }; + +void cleanup(void) +{ + if (mountpoint && (umount(mountpoint) == 0)) + rmdir(mountpoint); +} + +/* + * Debugging function: Verify the counters in the hugetlbfs superblock that + * are used to implement the filesystem quotas. + */ +void _verify_stat(int line, long tot, long free, long avail) +{ + struct statfs s; + statfs(mountpoint, &s); + + if (s.f_blocks != tot || s.f_bfree != free || s.f_bavail != avail) + FAIL("Bad quota counters at line %i: total: %li free: %li " + "avail: %li\n", line, s.f_blocks, s.f_bfree, s.f_bavail); +} +#define verify_stat(t, f, a) _verify_stat(__LINE__, t, f, a) + +void get_quota_fs(unsigned long size, char *prog) +{ + char mount_str[17]; + char mount_opts[50]; + int nr_written; + + nr_written = snprintf(mount_opts, 20, "size=%luK", size/1024); + + /* + * If the mount point now in use does not use the system default + * huge page size, specify the desired size when mounting. When + * the sizes do match, we avoid specifying the pagesize= option to + * preserve backwards compatibility with kernels that do not + * recognize that option. 
+ */ + if (!using_system_hpage_size(hugetlbfs_find_path())) + snprintf(mount_opts + nr_written, 29, ",pagesize=%lu", + hpage_size); + + sprintf(mount_str, "/tmp/huge-XXXXXX"); + if (!mkdtemp(mount_str)) + FAIL("Cannot create directory for mountpoint: %s", + strerror(errno)); + + if (mount("none", mount_str, "hugetlbfs", 0, mount_opts)) { + perror("mount"); + FAIL(); + } + mountpoint = mount_str; + + /* + * Set HUGETLB_PATH and then exec the test again. This will cause + * libhugetlbfs to use this newly created mountpoint. + */ + if (setenv("HUGETLB_PATH", mount_str, 1)) + FAIL("Cannot set HUGETLB_PATH environment variable: %s", + strerror(errno)); + verbose_printf("Using %s as temporary mount point.\n", mount_str); + + execlp(prog, prog, "-p", mount_str, NULL); + FAIL("execle failed: %s", strerror(errno)); +} + +void map(unsigned long size, int mmap_flags, int action_flags) +{ + int fd; + char *a, *b, *c; + + fd = hugetlbfs_unlinked_fd(); + if (!fd) { + verbose_printf("hugetlbfs_unlinked_fd () failed\n"); + exit(1); + } + + a = mmap(0, size, PROT_READ|PROT_WRITE, mmap_flags, fd, 0); + if (a == MAP_FAILED) { + verbose_printf("mmap failed: %s\n", strerror(errno)); + exit(1); + } + + + if (action_flags & ACTION_TOUCH) + for (b = a; b < a + size; b += hpage_size) + *(b) = 1; + + if (action_flags & ACTION_COW) { + c = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); + if (c == MAP_FAILED) { + verbose_printf("Creating COW mapping failed: %s\n", strerror(errno)); + exit(1); + } + if ((*c) != 1) { + verbose_printf("Data mismatch when setting up COW"); + exit(1); + } + (*c) = 0; + munmap(c, size); + } + + munmap(a, size); + close(fd); +} + +void do_unexpected_result(int line, int expected, int actual) +{ + FAIL("Unexpected result on line %i: expected %s, actual %s", + line, result_str[expected], result_str[actual]); +} + +void _spawn(int l, int expected_result, unsigned long size, int mmap_flags, + int action_flags) +{ + pid_t pid; + int status; + int actual_result; 
+ + pid = fork(); + if (pid == 0) { + map(size, mmap_flags, action_flags); + exit(0); + } else if (pid < 0) { + FAIL("fork(): %s", strerror(errno)); + } else { + waitpid(pid, &status, 0); + if (WIFEXITED(status)) { + if (WEXITSTATUS(status) == 0) + actual_result = GOOD; + else + actual_result = BAD_EXIT; + } else { + actual_result = BAD_SIG; + } + + if (actual_result != expected_result) + do_unexpected_result(l, expected_result, actual_result); + } +} +#define spawn(e,s,mf,af) _spawn(__LINE__, e, s, mf, af) + +int main(int argc, char ** argv) +{ + int private_resv; + int bad_priv_resv; + + test_init(argc, argv); + hpage_size = check_hugepagesize(); + + if ((argc == 3) && !strcmp(argv[1], "-p")) + mountpoint = argv[2]; + else + get_quota_fs(hpage_size, argv[0]); + + check_must_be_root(); + check_free_huge_pages(1); + + private_resv = kernel_has_private_reservations(); + if (private_resv == -1) + FAIL("kernel_has_private_reservations() failed\n"); + bad_priv_resv = private_resv ? BAD_EXIT : BAD_SIG; + + /* + * Check that unused quota is cleared when untouched mmaps are + * cleaned up. + */ + spawn(GOOD, hpage_size, MAP_PRIVATE, 0); + verify_stat(1, 1, 1); + spawn(GOOD, hpage_size, MAP_SHARED, 0); + verify_stat(1, 1, 1); + + /* + * Check that simple page instantiation works within quota limits + * for private and shared mappings. + */ + spawn(GOOD, hpage_size, MAP_PRIVATE, ACTION_TOUCH); + spawn(GOOD, hpage_size, MAP_SHARED, ACTION_TOUCH); + + /* + * Page instantiation should be refused if doing so puts the fs + * over quota. + */ + spawn(BAD_EXIT, 2 * hpage_size, MAP_SHARED, ACTION_TOUCH); + + /* + * If private mappings are reserved, the quota is checked up front + * (as is the case for shared mappings). + */ + spawn(bad_priv_resv, 2 * hpage_size, MAP_PRIVATE, ACTION_TOUCH); + + /* + * COW should not be allowed if doing so puts the fs over quota. 
+ */ + spawn(bad_priv_resv, hpage_size, MAP_SHARED, ACTION_TOUCH|ACTION_COW); + spawn(bad_priv_resv, hpage_size, MAP_PRIVATE, ACTION_TOUCH|ACTION_COW); + + /* + * Make sure that operations within the quota will succeed after + * some failures. + */ + spawn(GOOD, hpage_size, MAP_SHARED, ACTION_TOUCH); + spawn(GOOD, hpage_size, MAP_PRIVATE, ACTION_TOUCH); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/quota.sh b/default/libhugetlbfs/libhugetlbfs/tests/quota.sh new file mode 100755 index 0000000..398d442 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/quota.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +. wrapper-utils.sh + +# There are known bugs in quota accounting prior to 2.6.24 +compare_kvers `uname -r` "2.6.24" +if [ $? -eq 1 ]; then + EXP_RC=$RC_FAIL +else + EXP_RC=$RC_PASS +fi + +exec_and_check $EXP_RC quota "$@" diff --git a/default/libhugetlbfs/libhugetlbfs/tests/readahead_reserve.c b/default/libhugetlbfs/libhugetlbfs/tests/readahead_reserve.c new file mode 100644 index 0000000..d0a478b --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/readahead_reserve.c @@ -0,0 +1,86 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _GNU_SOURCE +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> +#include "hugetests.h" + +/* + * Test rationale: + * + * readahead() on some kernels can cause the reservation counter to get + * corrupted. The problem is that the patches are allocated for the + * reservation but not faulted in at the time of allocation. The + * counters do not get updated and effectively "leak". This test + * identifies whether the kernel is vunerable to the problem or not. + * It's fixed in kernel by commit f2deae9d4e70793568ef9e85d227abb7bef5b622. + */ +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long initial_rsvd, map_rsvd, readahead_rsvd, end_rsvd; + + test_init(argc, argv); + + /* Setup */ + hpage_size = check_hugepagesize(); + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + initial_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count before map: %lu\n", initial_rsvd); + + /* mmap a region and record reservations */ + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + map_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after map: %lu\n", map_rsvd); + + /* readahead the region and record reservations */ + if (readahead(fd, 0, hpage_size) == -1) + FAIL("readahead(): %s", strerror(errno)); + readahead_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after readahead: %lu\n", readahead_rsvd); + + /* Write the region */ + memset(p, 1, hpage_size); + + /* Free 
region */ + munmap(p, hpage_size); + close(fd); + end_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after close(): %lu\n", end_rsvd); + + /* Reserve count should match initial reserve count */ + if (end_rsvd != initial_rsvd) + FAIL("Reserve leaked: %lu != %lu\n", end_rsvd, initial_rsvd); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/readahead_reserve.sh b/default/libhugetlbfs/libhugetlbfs/tests/readahead_reserve.sh new file mode 100755 index 0000000..5ab7400 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/readahead_reserve.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +. wrapper-utils.sh + +# readahead is known broken before 2.6.30 +compare_kvers `uname -r` "2.6.30" +if [ $? -eq 1 ]; then + echo "FAIL (assumed) kernel bug" + exit $RC_FAIL +else + EXP_RC=$RC_PASS + exec_and_check $EXP_RC readahead_reserve "$@" +fi + diff --git a/default/libhugetlbfs/libhugetlbfs/tests/readback.c b/default/libhugetlbfs/libhugetlbfs/tests/readback.c new file mode 100644 index 0000000..984369c --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/readback.c @@ -0,0 +1,64 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned int *q; + int i; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + q = p; + for (i = 0; i < (hpage_size / sizeof(*q)); i++) { + q[i] = RANDOM_CONSTANT ^ i; + } + + for (i = 0; i < (hpage_size / sizeof(*q)); i++) { + if (q[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Mismatch at offset 0x%x: 0x%x instead of 0x%x", + i, q[i], RANDOM_CONSTANT ^ i); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/run_tests.py b/default/libhugetlbfs/libhugetlbfs/tests/run_tests.py new file mode 100755 index 0000000..8055940 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/run_tests.py @@ -0,0 +1,704 @@ +#! /usr/bin/env python + +import subprocess +import types +import os +import sys +import getopt +import resource + +# The superset of wordsizes that should be tested (default 32, 64) +wordsizes = set() + +# The super set of page sizes that should be tested. 
Defaults to all supported +# huge page sizes with an active mount and at least one huge page allocated +pagesizes = set() + +# Each page size may have a subset of valid wordsizes +# This is a dictionary (indexed by page size) of sets +wordsizes_by_pagesize = {} + +# The linkhuge tests may only be valid on a subset of word sizes +# This set contains the wordsizes valid for linkhuge tests +linkhuge_wordsizes = set() + +# A list of all discovered mountpoints that may be used by libhugetlbfs for +# this run of tests. This is used for cleaning up left-over share files. +mounts = [] + +# Results matrix: This 3-D dictionary is indexed as follows: +# [type] - Test results fall into one of the 'result_types' categories +# [pagesize] - a page size from the set 'pagesizes' +# [bits] - a word size from the set 'wordsizes' +# The indexed value is the number of tests matching the above traits +R = {} +result_types = ("total", "pass", "config", "fail", "xfail", "xpass", + "signal", "strange", "skip") + +def bash(cmd): + """ + Run 'cmd' in the shell and return the exit code and output. 
+ """ + p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE) + try: + rc = p.wait() + except KeyboardInterrupt: + # Abort and mark this a strange test result + return (127, "") + out = p.stdout.read().strip() + return (rc, out) + +def snapshot_pool_state(): + l = [] + for d in os.listdir("/sys/kernel/mm/hugepages"): + substate = [(f, int(open("/sys/kernel/mm/hugepages/%s/%s" % (d, f)).read())) + for f in os.listdir("/sys/kernel/mm/hugepages/%s" % d)] + l.append((d, tuple(substate))) + return tuple(l) + +def run_test_prog(bits, pagesize, cmd, **env): + if paranoid_pool_check: + beforepool = snapshot_pool_state() + print "Pool state: %s" % str(beforepool) + + local_env = os.environ.copy() + local_env.update(env) + local_env["PATH"] = "./obj%d:../obj%d:%s" \ + % (bits, bits, local_env.get("PATH", "")) + local_env["LD_LIBRARY_PATH"] = "../obj%d:obj%d:%s" \ + % (bits, bits, local_env.get("LD_LIBRARY_PATH", "")) + local_env["HUGETLB_DEFAULT_PAGE_SIZE"] = repr(pagesize) + + try: + p = subprocess.Popen(cmd, env=local_env, stdout=subprocess.PIPE) + rc = p.wait() + except KeyboardInterrupt: + # Abort and mark this a strange test result + return (None, "") + except OSError: + return (None, "") + out = p.stdout.read().strip() + + if paranoid_pool_check: + afterpool = snapshot_pool_state() + if afterpool != beforepool: + print >>sys.stderr, "Hugepage pool state not preserved!" + print >>sys.stderr, "BEFORE: %s" % str(beforepool) + print >>sys.stderr, "AFTER: %s" % str(afterpool) + sys.exit(98) + + return (rc, out) + +def setup_env(override, defaults): + """ + Set up the environment for running commands in the shell. 
+ """ + # All items in override are unconditionally set or unset + for (var, val) in override.items(): + if val == None: + if var in os.environ: + del os.environ[var] + else: + os.environ[var] = val + # If not already set, these variables are given default values + for (var, val) in defaults.items(): + if var not in os.environ or os.environ[var] == "": + os.environ[var] = val + +def init_results(): + """ + Define the structure of the results matrix and initialize all results to 0. + """ + global R + + for t in result_types: + R[t] = {} + for p in pagesizes: + R[t][p] = {} + for bits in (32, 64): + R[t][p][bits] = 0 + +def pretty_page_size(size): + """ + Convert a page size to a formatted string + + Given a page size in bytes, return a string that expresses the size in + a sensible unit (K, M, or G). + """ + factor = 0 + while size > 1024: + factor += 1 + size /= 1024 + + if factor == 0: return "%iB" % size + elif factor == 1: return "%iK" % size + elif factor == 2: return "%iM" % size + elif factor == 3: return "%iG" % size + +def print_per_size(title, values): + """ + Print one line of test results + + Print the results of a given result type on one line. The results for all + page sizes and word sizes are written in a table format. 
+ """ + print "*%20s: " % title, + for sz in pagesizes: + print "%4s %4s " % (values[sz][32], values[sz][64]), + print + +def results_summary(): + """ + Display a summary of the test results + """ + print "********** TEST SUMMARY" + print "*%21s" % "", + for p in pagesizes: print "%-13s " % pretty_page_size(p), + print + print "*%21s" % "", + for p in pagesizes: print "32-bit 64-bit ", + print + + print_per_size("Total testcases", R["total"]) + print_per_size("Skipped", R["skip"]) + print_per_size("PASS", R["pass"]) + print_per_size("FAIL", R["fail"]) + print_per_size("Killed by signal", R["signal"]) + print_per_size("Bad configuration", R["config"]) + print_per_size("Expected FAIL", R["xfail"]) + print_per_size("Unexpected PASS", R["xpass"]) + print_per_size("Strange test result", R["strange"]) + print "**********" + +def free_hpages(): + """ + Return the number of free huge pages. + + Parse /proc/meminfo to obtain the number of free huge pages for + the default page size. + XXX: This function is not multi-size aware yet. + """ + (rc, out) = bash("grep 'HugePages_Free:' /proc/meminfo | cut -f2 -d:") + return (rc, int(out)) + +def total_hpages(): + """ + Return the total number of huge pages in the pool. + + Parse /proc/meminfo to obtain the number of huge pages for the default + page size. + XXX: This function is not multi-size aware yet. + """ + (rc, out) = bash("grep 'HugePages_Total:' /proc/meminfo | cut -f2 -d:") + return (rc, int(out)) + +def hpage_size(): + """ + Return the size of the default huge page size in bytes. + + Parse /proc/meminfo to obtain the default huge page size. This number is + reported in Kb so multiply it by 1024 to convert it to bytes. + XXX: This function is not multi-size aware yet. + """ + (rc, out) = bash("grep 'Hugepagesize:' /proc/meminfo | awk '{print $2}'") + if out == "": out = 0 + out = int(out) * 1024 + return (rc, out) + +def clear_hpages(): + """ + Remove stale hugetlbfs files after sharing tests. 
+ + Traverse the mount points that are in use during testing to find left-over + files that were created by the elflink sharing tests. These are not + cleaned up automatically and must be removed to free up the huge pages. + """ + for mount in mounts: + dir = mount + "/elflink-uid-" + `os.getuid()` + for root, dirs, files in os.walk(dir, topdown=False): + for name in files: + os.remove(os.path.join(root, name)) + for name in dirs: + os.rmdir(os.path.join(root, name)) + try: + os.rmdir(dir) + except OSError: + pass + +def get_pagesizes(): + """ + Get a list of configured huge page sizes. + + Use libhugetlbfs' hugeadm utility to get a list of page sizes that have + active mount points and at least one huge page allocated to the pool. + """ + sizes = set() + out = "" + (rc, out) = bash("../obj/hugeadm --page-sizes") + if rc != 0 or out == "": return sizes + + for size in out.split("\n"): sizes.add(int(size)) + return sizes + +def get_wordsizes(): + """ + Checks for obj32 and obj64 subdirs to determine valid word sizes. + """ + sizes = set() + if os.path.isdir("./obj32"): sizes.add(32) + if os.path.isdir("./obj64"): sizes.add(64) + + return sizes + +def check_hugetlbfs_path(): + """ + Check each combination of page size and word size for validity. + + Some word sizes may not be valid for all page sizes. For example, a 16G + page is too large to be used in a 32 bit process. Use a helper program to + weed out invalid combinations and print informational messages as required. 
+ """ + global wordsizes, pagesizes, mounts, wordsizes_by_pagesize + + for p in pagesizes: + okbits = [] + for b in wordsizes: + (rc, out) = run_test_prog(b, p, "get_hugetlbfs_path") + if rc == 0: + okbits.append(b) + mounts.append(out) + if len(okbits) == 0: + print "run_tests.py: No mountpoints available for page size %s" % \ + pretty_page_size(p) + wordsizes_by_pagesize[p] = set() + continue + for b in wordsizes - set(okbits): + print "run_tests.py: The %i bit word size is not compatible with " \ + "%s pages" % (b, pretty_page_size(p)) + wordsizes_by_pagesize[p] = set(okbits) + +def check_linkhuge_tests(): + """ + Check if the linkhuge tests are safe to run on this system. + + Newer versions of binutils (>= 2.18) are known to be incompatible with the + linkhuge tests and running them may cause unreliable behavior. Determine + which word sizes can be tested with linkhuge. The others will be skipped. + NOTE: The linhuge_rw tests are always safe to run and will not be skipped. + """ + okbits = set() + + for bits in wordsizes: + script = open('obj%d/dummy.ldscript' % bits, 'r').read() + if script.count('SPECIAL') == 0: + okbits.add(bits) + return okbits + +def print_cmd(pagesize, bits, cmd, env): + if env: + print ' '.join(['%s=%s' % (k, v) for k, v in env.items()]), + if type(cmd) != types.StringType: + cmd = ' '.join(cmd) + print "%s (%s: %i):\t" % (cmd, pretty_page_size(pagesize), bits), + sys.stdout.flush() + +def run_test(pagesize, bits, cmd, **env): + """ + Execute a test, print the output and log the result + + Run a test using the specified page size and word size. The parameter + 'pre' may contain additional environment settings and will be prepended to + cmd. A line showing info about the test is printed and after completion + the test output is printed. The result is recorded in the result matrix. 
+ """ + global R + + objdir = "obj%i" % bits + if not os.path.isdir(objdir): + return + + print_cmd(pagesize, bits, cmd, env) + (rc, out) = run_test_prog(bits, pagesize, cmd, **env) + print out + + R["total"][pagesize][bits] += 1 + if rc == 0: R["pass"][pagesize][bits] += 1 + elif rc == 1: R["config"][pagesize][bits] += 1 + elif rc == 2: R["fail"][pagesize][bits] += 1 + elif rc == 3: R["xfail"][pagesize][bits] += 1 + elif rc == 4: R["xpass"][pagesize][bits] += 1 + elif rc < 0: R["signal"][pagesize][bits] += 1 + else: R["strange"][pagesize][bits] += 1 + +def skip_test(pagesize, bits, cmd, **env): + """ + Skip a test, print test information, and log that it was skipped. + """ + global tot_tests, tot_skip + R["total"][pagesize][bits] += 1 + R["skip"][pagesize][bits] += 1 + print_cmd(pagesize, bits, cmd, env) + print "SKIPPED" + +def do_test(cmd, bits=None, **env): + """ + Run a test case, testing each page size and each indicated word size. + """ + if bits == None: bits = wordsizes + for p in pagesizes: + for b in (set(bits) & wordsizes_by_pagesize[p]): + run_test(p, b, cmd, **env) + +def do_test_with_rlimit(rtype, limit, cmd, bits=None, **env): + """ + Run a test case with a temporarily altered resource limit. + """ + oldlimit = resource.getrlimit(rtype) + resource.setrlimit(rtype, (limit, limit)) + do_test(cmd, bits, **env) + resource.setrlimit(rtype, oldlimit) + +def do_elflink_test(cmd, **env): + """ + Run an elflink test case, skipping known-bad configurations. + """ + for p in pagesizes: + for b in wordsizes_by_pagesize[p]: + if b in linkhuge_wordsizes: run_test(p, b, cmd, **env) + else: skip_test(p, b, cmd, **env) + +def elflink_test(cmd, **env): + """ + Run an elflink test case with different configuration combinations. + + Test various combinations of: preloading libhugetlbfs, B vs. BDT link + modes, minimal copying on or off, and disabling segment remapping. 
+ */ + do_test(cmd, **env) + # Test we don't blow up if not linked for hugepage + do_test(cmd, LD_PRELOAD="libhugetlbfs.so", **env) + do_elflink_test("xB." + cmd, **env) + do_elflink_test("xBDT." + cmd, **env) + # Test we don't blow up if HUGETLB_MINIMAL_COPY is disabled + do_elflink_test("xB." + cmd, HUGETLB_MINIMAL_COPY="no", **env) + do_elflink_test("xBDT." + cmd, HUGETLB_MINIMAL_COPY="no", **env) + # Test that HUGETLB_ELFMAP=no inhibits remapping as intended + do_elflink_test("xB." + cmd, HUGETLB_ELFMAP="no", **env) + do_elflink_test("xBDT." + cmd, HUGETLB_ELFMAP="no", **env) + +def elflink_rw_test(cmd, **env): + """ + Run the elflink_rw test with different configuration combinations. + + Test various combinations of: remapping modes and minimal copy on or off. + """ + # Basic tests: None, Read-only, Write-only, Read-Write, explicit disable + do_test(cmd, **env) + do_test(cmd, HUGETLB_ELFMAP="R", **env) + do_test(cmd, HUGETLB_ELFMAP="W", **env) + do_test(cmd, HUGETLB_ELFMAP="RW", **env) + do_test(cmd, HUGETLB_ELFMAP="no", **env) + + # Test we don't blow up if HUGETLB_MINIMAL_COPY is disabled + do_test(cmd, HUGETLB_MINIMAL_COPY="no", HUGETLB_ELFMAP="R", **env) + do_test(cmd, HUGETLB_MINIMAL_COPY="no", HUGETLB_ELFMAP="W", **env) + do_test(cmd, HUGETLB_MINIMAL_COPY="no", HUGETLB_ELFMAP="RW", **env) + +def elfshare_test(cmd, **env): + """ + Test segment sharing with multiple configuration variations. + """ + # Run each elfshare test invocation independently - clean up the + # sharefiles before and after in the first set of runs, but leave + # them there in the second: + clear_hpages() + do_elflink_test("xB." + cmd, HUGETLB_SHARE="1", **env) + clear_hpages() + do_elflink_test("xBDT." + cmd, HUGETLB_SHARE="1", **env) + clear_hpages() + do_elflink_test("xB." + cmd, HUGETLB_SHARE="1", **env) + do_elflink_test("xBDT." 
+ cmd, HUGETLB_SHARE="1", **env) + clear_hpages() + +def elflink_and_share_test(cmd, **env): + """ + Run the ordinary linkhuge tests with sharing enabled + """ + # Run each elflink test pair independently - clean up the sharefiles + # before and after each pair + clear_hpages() + for link_str in ("xB.", "xBDT."): + for i in range(2): + do_elflink_test(link_str + cmd, HUGETLB_SHARE=repr(i), **env) + clear_hpages() + +def elflink_rw_and_share_test(cmd, **env): + """ + Run the ordinary linkhuge_rw tests with sharing enabled + """ + clear_hpages() + for mode in ("R", "W", "RW"): + for i in range(2): + do_test(cmd, HUGETLB_ELFMAP=mode, HUGETLB_SHARE=repr(i), **env) + clear_hpages() + +def setup_shm_sysctl(limit): + """ + Adjust the kernel shared memory limits to accomodate a desired size. + + The original values are returned in a dictionary that can be passed to + restore_shm_sysctl() to restore the system state. + """ + if os.getuid() != 0: return {} + sysctls = {} + files = [ "/proc/sys/kernel/shmmax", "/proc/sys/kernel/shmall"] + for f in files: + fh = open(f, "r") + sysctls[f] = fh.read() + fh.close() + fh = open(f, "w") + fh.write(`limit`) + fh.close() + print "set shmmax limit to %s" % limit + return sysctls + +def restore_shm_sysctl(sysctls): + """ + Restore the sysctls named in 'sysctls' to the given values. + """ + if os.getuid() != 0: return + for (file, val) in sysctls.items(): + fh = open(file, "w") + fh.write(val) + fh.close() + +def do_shm_test(cmd, limit=None, bits=None, **env): + """ + Run a test case with temporarily expanded SysV shm limits, testing + each indicated word size. + """ + if bits == None: + bits = wordsizes + if limit != None: + tmp = setup_shm_sysctl(limit) + for b in bits: + run_test(system_default_hpage_size, b, cmd, **env) + if limit != None: + restore_shm_sysctl(tmp) + +def functional_tests(): + """ + Run the set of functional tests. 
+ """ + global linkhuge_wordsizes + + # Kernel background tests not requiring hugepage support + do_test("zero_filesize_segment") + + # Library background tests not requiring hugepage support + do_test("test_root") + do_test("meminfo_nohuge") + + # Library tests requiring kernel hugepage support + do_test("gethugepagesize") + do_test("gethugepagesizes") + do_test("empty_mounts", HUGETLB_VERBOSE="1") + do_test("large_mounts", HUGETLB_VERBOSE="1") + + # Tests requiring an active and usable hugepage mount + do_test("find_path") + do_test("unlinked_fd") + do_test("readback") + do_test("truncate") + do_test("shared") + do_test("mprotect") + do_test_with_rlimit(resource.RLIMIT_MEMLOCK, -1, "mlock") + do_test("misalign") + + # Specific kernel bug tests + do_test("mremap-expand-slice-collision") + do_test("mremap-fixed-huge-near-normal") + do_test("mremap-fixed-normal-near-huge") + do_test("ptrace-write-hugepage") + do_test("icache-hygiene") + do_test("slbpacaflush") + do_test("straddle_4GB", bits=(64,)) + do_test("huge_at_4GB_normal_below", bits=(64,)) + do_test("huge_below_4GB_normal_above", bits=(64,)) + do_test("map_high_truncate_2") + do_test("misaligned_offset") + do_test("truncate_above_4GB") + do_test("brk_near_huge") + do_test("task-size-overrun") + do_test_with_rlimit(resource.RLIMIT_STACK, -1, "stack_grow_into_huge") + + if dangerous == 1: + do_test("readahead_reserve") + do_test("madvise_reserve") + do_test("fadvise_reserve") + do_test("mremap-expand-slice-collision") + do_test("mremap-fixed-normal-near-huge") + do_test("mremap-fixed-huge-near-normal") + else: + do_test("readahead_reserve.sh") + do_test("madvise_reserve.sh") + do_test("fadvise_reserve.sh") + do_test("mremap-expand-slice-collision.sh") + do_test("mremap-fixed-normal-near-huge.sh") + do_test("mremap-fixed-huge-near-normal.sh") + do_shm_test("shm-perms", 64*1024*1024) + + # Tests requiring an active mount and hugepage COW + do_test("private") + do_test("fork-cow") + do_test("direct") + 
 do_test("malloc") + do_test("malloc", LD_PRELOAD="libhugetlbfs.so", HUGETLB_MORECORE="yes") + do_test("malloc", LD_PRELOAD="libhugetlbfs.so", HUGETLB_MORECORE="yes", + HUGETLB_RESTRICT_EXE="unknown:none") + do_test("malloc", LD_PRELOAD="libhugetlbfs.so", HUGETLB_MORECORE="yes", + HUGETLB_RESTRICT_EXE="unknown:malloc") + do_test("malloc_manysmall") + do_test("malloc_manysmall", LD_PRELOAD="libhugetlbfs.so", + HUGETLB_MORECORE="yes") + do_test("heapshrink") + do_test("heapshrink", LD_PRELOAD="libheapshrink.so") + do_test("heapshrink", LD_PRELOAD="libhugetlbfs.so", HUGETLB_MORECORE="yes") + do_test("heapshrink", LD_PRELOAD="libhugetlbfs.so libheapshrink.so", + HUGETLB_MORECORE="yes") + do_test("heapshrink", LD_PRELOAD="libheapshrink.so", HUGETLB_MORECORE="yes", + HUGETLB_MORECORE_SHRINK="yes") + do_test("heapshrink", LD_PRELOAD="libhugetlbfs.so libheapshrink.so", + HUGETLB_MORECORE="yes", HUGETLB_MORECORE_SHRINK="yes") + do_test("heap-overflow", HUGETLB_VERBOSE="1", HUGETLB_MORECORE="yes") + + # Run the remapping tests' up-front checks + linkhuge_wordsizes = check_linkhuge_tests() + # Original elflink tests + elflink_test("linkhuge_nofd", HUGETLB_VERBOSE="0") + elflink_test("linkhuge") + + # Original elflink sharing tests + elfshare_test("linkshare") + elflink_and_share_test("linkhuge") + + # elflink_rw tests + elflink_rw_test("linkhuge_rw") + # elflink_rw sharing tests + elflink_rw_and_share_test("linkhuge_rw") + + # Accounting bug tests + # reset free hpages because sharing will have held some + # alternatively, use + do_test("chunk-overcommit") + do_test(("alloc-instantiate-race", "shared")) + do_test(("alloc-instantiate-race", "private")) + do_test("truncate_reserve_wraparound") + do_test("truncate_sigbus_versus_oom") + + # Test direct allocation API + do_test("get_huge_pages") + + # Test overriding of shmget() + do_shm_test("shmoverride_linked") + do_shm_test("shmoverride_linked", HUGETLB_SHM="yes") + do_shm_test("shmoverride_linked_static") + 
do_shm_test("shmoverride_linked_static", HUGETLB_SHM="yes") + do_shm_test("shmoverride_unlinked", LD_PRELOAD="libhugetlbfs.so") + do_shm_test("shmoverride_unlinked", LD_PRELOAD="libhugetlbfs.so", HUGETLB_SHM="yes") + + # Test hugetlbfs filesystem quota accounting + do_test("quota.sh") + + # Test accounting of HugePages_{Total|Free|Resv|Surp} + # Alters the size of the hugepage pool so should probably be run last + do_test("counters.sh") + +def stress_tests(): + """ + Run the set of stress tests. + """ + iterations = 10 # Number of iterations for looping tests + + # Don't update NRPAGES every time like above because we want to catch the + # failures that happen when the kernel doesn't release all of the huge pages + # after a stress test terminates + (rc, nr_pages) = free_hpages() + + do_test(("mmap-gettest", repr(iterations), repr(nr_pages))) + + # mmap-cow needs a hugepages for each thread plus one extra + do_test(("mmap-cow", repr(nr_pages-1), repr(nr_pages))) + + (rc, tot_pages) = total_hpages() + limit = system_default_hpage_size * tot_pages + threads = 10 # Number of threads for shm-fork + + # Run shm-fork once using half available hugepages, then once using all + # This is to catch off-by-ones or races in the kernel allocated that + # can make allocating all hugepages a problem + if nr_pages > 1: + do_shm_test(("shm-fork", repr(threads), repr(nr_pages / 2)), limit) + do_shm_test(("shm-fork", repr(threads), repr(nr_pages)), limit) + + do_shm_test(("shm-getraw", repr(nr_pages), "/dev/full"), limit) + +def main(): + global wordsizes, pagesizes, dangerous, paranoid_pool_check, system_default_hpage_size + testsets = set() + env_override = {"QUIET_TEST": "1", "HUGETLBFS_MOUNTS": "", + "HUGETLB_ELFMAP": None, "HUGETLB_MORECORE": None} + env_defaults = {"HUGETLB_VERBOSE": "0"} + dangerous = 0 + paranoid_pool_check = False + + try: + opts, args = getopt.getopt(sys.argv[1:], "vVfdt:b:p:c") + except getopt.GetoptError, err: + print str(err) + sys.exit(1) + for opt, arg 
in opts: + if opt == "-v": + env_override["QUIET_TEST"] = None + env_defaults["HUGETLB_VERBOSE"] = "2" + elif opt == "-V": + env_defaults["HUGETLB_VERBOSE"] = "99" + elif opt == "-f": + dangerous = 1 + elif opt == "-t": + for t in arg.split(): testsets.add(t) + elif opt == "-b": + for b in arg.split(): wordsizes.add(int(b)) + elif opt == "-p": + for p in arg.split(): pagesizes.add(int(p)) + elif opt == '-c': + paranoid_pool_check = True + else: + assert False, "unhandled option" + if len(testsets) == 0: testsets = set(["func", "stress"]) + if len(wordsizes) == 0: wordsizes = get_wordsizes() + if len(pagesizes) == 0: pagesizes = get_pagesizes() + + if len(pagesizes) == 0: + print "Unable to find available page sizes, are you sure hugetlbfs" + print "is mounted and there are available huge pages?" + return 1 + + setup_env(env_override, env_defaults) + init_results() + + (rc, system_default_hpage_size) = hpage_size() + if rc != 0: + print "Unable to find system default hugepage size." + print "Is hugepage supported included in this kernel?" + return 1 + + check_hugetlbfs_path() + + if "func" in testsets: functional_tests() + if "stress" in testsets: stress_tests() + + results_summary() + +if __name__ == "__main__": + main() diff --git a/default/libhugetlbfs/libhugetlbfs/tests/shared.c b/default/libhugetlbfs/libhugetlbfs/tests/shared.c new file mode 100644 index 0000000..e04fb04 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/shared.c @@ -0,0 +1,71 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p, *q; + unsigned int *pl, *ql; + int i; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() 1: %s", strerror(errno)); + + q = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (q == MAP_FAILED) + FAIL("mmap() 2: %s", strerror(errno)); + + pl = p; + for (i = 0; i < (hpage_size / sizeof(*pl)); i++) { + pl[i] = RANDOM_CONSTANT ^ i; + } + + ql = q; + for (i = 0; i < (hpage_size / sizeof(*ql)); i++) { + if (ql[i] != (RANDOM_CONSTANT ^ i)) + FAIL("Mismatch"); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/shm-fork.c b/default/libhugetlbfs/libhugetlbfs/tests/shm-fork.c new file mode 100644 index 0000000..62d2781 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/shm-fork.c @@ -0,0 +1,136 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/mman.h> +#include <sys/wait.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +#define P "shm-fork" +#define DESC \ + "* Test shared memory behavior when multiple threads are attached *\n"\ + "* to a segment. A segment is created and then children are *\n"\ + "* spawned which attach, write, read (verify), and detach from the *\n"\ + "* shared memory segment. *" + +extern int errno; + +/* Global Configuration */ +static int nr_hugepages; +static int numprocs; +static int shmid = -1; + +#define MAX_PROCS 200 +#define BUF_SZ 256 + +#define CHILD_FAIL(thread, fmt, ...) 
\ + do { \ + verbose_printf("Thread %d (pid=%d) FAIL: " fmt, \ + thread, getpid(), __VA_ARGS__); \ + exit(1); \ + } while (0) + +void cleanup(void) +{ + remove_shmid(shmid); +} + +static void do_child(int thread, unsigned long size) +{ + volatile char *shmaddr; + int j, k; + + verbose_printf("."); + for (j=0; j<5; j++) { + shmaddr = shmat(shmid, 0, SHM_RND); + if (shmaddr == MAP_FAILED) + CHILD_FAIL(thread, "shmat() failed: %s", + strerror(errno)); + + for (k=0;k<size;k++) + shmaddr[k] = (char) (k); + for (k=0;k<size;k++) + if (shmaddr[k] != (char)k) + CHILD_FAIL(thread, "Index %d mismatch", k); + + if (shmdt((const void *)shmaddr) != 0) + CHILD_FAIL(thread, "shmdt() failed: %s", + strerror(errno)); + } + exit(0); +} + +int main(int argc, char ** argv) +{ + unsigned long size; + long hpage_size; + int pid, status; + int i; + int wait_list[MAX_PROCS]; + + test_init(argc, argv); + + if (argc < 3) + CONFIG("Usage: %s <# procs> <# pages>", argv[0]); + + numprocs = atoi(argv[1]); + nr_hugepages = atoi(argv[2]); + + if (numprocs > MAX_PROCS) + CONFIG("Cannot spawn more than %d processes", MAX_PROCS); + + check_hugetlb_shm_group(); + + hpage_size = check_hugepagesize(); + size = hpage_size * nr_hugepages; + verbose_printf("Requesting %lu bytes\n", size); + if ((shmid = shmget(2, size, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W )) < 0) + FAIL("shmget(): %s", strerror(errno)); + + verbose_printf("shmid: %d\n", shmid); + + verbose_printf("Spawning children:\n"); + for (i=0; i<numprocs; i++) { + if ((pid = fork()) < 0) + FAIL("fork(): %s", strerror(errno)); + + if (pid == 0) + do_child(i, size); + + wait_list[i] = pid; + } + + for (i=0; i<numprocs; i++) { + waitpid(wait_list[i], &status, 0); + if (WEXITSTATUS(status) != 0) + FAIL("Thread %d (pid=%d) failed", i, wait_list[i]); + + if (WIFSIGNALED(status)) + FAIL("Thread %d (pid=%d) received unhandled signal", + i, wait_list[i]); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/shm-getraw.c 
b/default/libhugetlbfs/libhugetlbfs/tests/shm-getraw.c new file mode 100644 index 0000000..84a876a --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/shm-getraw.c @@ -0,0 +1,106 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <fcntl.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +extern int errno; + +/* Global Configuration */ +#define P "shm-getraw" +#define DESC \ + "* This test exercizes the code path which performs raw device IO *\n"\ + "* into a large page backed shared memory segment. The specified *\n"\ + "* device will be read into a shared memory segment. 
*" + +static int nr_hugepages; +static int shmid = -1; + +void cleanup(void) +{ + remove_shmid(shmid); +} + +int main(int argc, char ** argv) +{ + size_t size; + size_t i; + long hpage_size = check_hugepagesize(); + volatile char *shmaddr; + char *buffer; + int raw_fd; + + test_init(argc, argv); + + check_hugetlb_shm_group(); + + if (argc < 3) + CONFIG("Usage: %s <# pages> <device>", argv[0]); + + nr_hugepages = atoi(argv[1]); + + verbose_printf("hpage_size is: %ld\n", hpage_size); + + buffer = malloc(hpage_size*sizeof(char)); + if (!buffer) + FAIL("malloc(%li)", hpage_size*sizeof(char)); + + raw_fd = open(argv[2], O_RDONLY); + if (!raw_fd) + CONFIG("Cannot open raw device: %s", strerror(errno)); + + size = hpage_size * nr_hugepages; + + verbose_printf("Requesting %zu bytes\n", size); + + if ((shmid = shmget(2, size, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W )) < 0) + FAIL("shmget(): %s", strerror(errno)); + + verbose_printf("shmid: 0x%x\n", shmid); + shmaddr = shmat(shmid, 0, SHM_RND) ; + if (shmaddr == MAP_FAILED) + FAIL("shmat() failed: %s", strerror(errno)); + + verbose_printf("shmaddr: %p\n", shmaddr); + + /* Read a page from device and write to shm segment */ + for (i = 0; i < size; i+=hpage_size) { + if (!read(raw_fd, buffer, hpage_size)) + FAIL("Can't read from raw device: %s", + strerror(errno)); + memcpy((char*)(shmaddr + i), buffer, hpage_size); + } + + verbose_printf("Done.\n"); + if (shmdt((const void *)shmaddr) != 0) + FAIL("shmdt() failed: %s", strerror(errno)); + + free(buffer); + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/shm-gettest.c b/default/libhugetlbfs/libhugetlbfs/tests/shm-gettest.c new file mode 100644 index 0000000..a0f17eb --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/shm-gettest.c @@ -0,0 +1,110 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +extern int errno; + +/* Global Configuration */ +#define P "shm-gettest" +#define DESC \ + "* A looping test to verify the functionality of large page backed *\n"\ + "* shared memory segments. A segment is created, written, *\n"\ + "* verified, and detached a specified number of times. 
*" + +static unsigned int iter; +static int nr_hugepages; +static int shmid = -1; +static long hpage_size; + +void cleanup(void) +{ + remove_shmid(shmid); +} + +static void do_one(size_t size) { + size_t i,j; + char pattern; + char *shmaddr; + + verbose_printf("Requesting %zu bytes\n", size); + + if ((shmid = shmget(2, size, SHM_HUGETLB|IPC_CREAT|SHM_R|SHM_W )) < 0) + FAIL("shmget(): %s", strerror(errno)); + + verbose_printf("shmid: 0x%x\n", shmid); + + shmaddr = shmat(shmid, 0, SHM_RND) ; + if (shmaddr == MAP_FAILED) + FAIL("shmat(): %s", strerror(errno)); + + verbose_printf("shmaddr: %p\n", shmaddr); + + for (i = 0; i < nr_hugepages; i++) { + pattern = 65+(i%26); + verbose_printf("Touching %p with %c\n", shmaddr+(i*hpage_size),pattern); + memset(shmaddr+(i*hpage_size), pattern, hpage_size); + } + + for (i = 0; i < nr_hugepages; i++) { + pattern = 65+(i%26); + verbose_printf("Verifying %p\n", (shmaddr+(i*hpage_size))); + for (j = 0; j < hpage_size; j++) + if (*(shmaddr+(i*hpage_size)+j) != pattern) + FAIL("Verifying the segment failed. 
" + "Got %c, expected %c", + *(shmaddr+(i*hpage_size)+j), pattern); + } + + if (shmdt((const void *)shmaddr) != 0) + FAIL("shmdt(): %s", strerror(errno)); +} + +int main(int argc, char ** argv) +{ + size_t size; + int i; + + test_init(argc, argv); + + if (argc < 3) + CONFIG("Usage: shmgettest <# iterations> <# pages>\n"); + + check_hugetlb_shm_group(); + + iter = atoi(argv[1]); + nr_hugepages = atoi(argv[2]); + + hpage_size = gethugepagesize(); + size = nr_hugepages * hpage_size; + + for (i=0; i < iter; i++) + do_one(size); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/shm-perms.c b/default/libhugetlbfs/libhugetlbfs/tests/shm-perms.c new file mode 100644 index 0000000..590a101 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/shm-perms.c @@ -0,0 +1,131 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <errno.h> +#include <memory.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/wait.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +#define P "shm-perms" +#define DESC \ + "* Test shared memory behavior when multiple threads are attached *\n"\ + "* to a segment with different permissions. 
A segment is created *\n"\ + "* and children attach read-only to check reservation accounting. *" + +#define SEGMENT_SIZE ((size_t)0x4000000) +#define SEGMENT_KEY 0x82ba15ff +#define STRIDE 0x200000 + +static int global_shmid = -1; +void *shm_addr = NULL; + +void cleanup(void) +{ + remove_shmid(global_shmid); +} + +int attach_segment(size_t segsize, int shmflags, int shmperms) +{ + int shmid; + + /* Create/get large segment */ + shmid = shmget(SEGMENT_KEY, segsize, shmflags); + if (shmid == -1) { + perror("shmget(SEGMENT)"); + cleanup(); + exit(EXIT_FAILURE); + } + + /* Attach large segment */ + if ( (shm_addr = shmat(shmid, shm_addr, shmperms)) == (void *)-1) { + perror("shmat(SEGMENT)"); + cleanup(); + exit(EXIT_FAILURE); + } + + global_shmid = shmid; + return shmid; +} + +int main(int argc, char **argv) +{ + char *p; + pid_t *wait_list; + int i, iterations; + long hpage_size = check_hugepagesize(); + long total_hpages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + + /* Setup */ + test_init(argc, argv); + check_hugetlb_shm_group(); + if (hpage_size > SEGMENT_SIZE) + CONFIG("Page size is too large for configured SEGMENT_SIZE\n"); + check_free_huge_pages(SEGMENT_SIZE / hpage_size); + + iterations = (total_hpages * hpage_size) / SEGMENT_SIZE + 1; + verbose_printf("iterations = %d\n", iterations); + + wait_list = malloc(sizeof(pid_t) * iterations); + if (wait_list == NULL) + FAIL("Failed to allocate wait_list"); + + /* Create, attach and part init segment */ + attach_segment(SEGMENT_SIZE, IPC_CREAT|SHM_HUGETLB|0640, 0); + p = (char *)shm_addr; + for (i = 0; i < 4; i++, p += STRIDE) + memset(p, 0x55, STRIDE); + + /* Detach segment */ + if (shmdt(shm_addr) != 0) + FAIL("shmdt(SEGMENT)"); + + /* Create children to reattach read-only */ + for (i = 0; i < iterations; i++) { + pid_t pid; + pid = fork(); + if (pid == -1) + FAIL("fork"); + + if (pid) { + wait_list[i] = pid; + } else { + attach_segment(0, 0, SHM_RDONLY); + if (shmdt(shm_addr) != 0) { + 
perror("shmdt(SEGMENT)"); + exit(EXIT_FAILURE); + } + exit(EXIT_SUCCESS); + } + } + + /* Wait for all children to exit */ + for (i = 0; i < iterations; i++) { + int status; + if (waitpid(wait_list[i], &status, 0) == -1) + FAIL("waitpid"); + if (status != EXIT_SUCCESS) + FAIL("Child exited with failure"); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/shmoverride_unlinked.c b/default/libhugetlbfs/libhugetlbfs/tests/shmoverride_unlinked.c new file mode 100644 index 0000000..b82af89 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/shmoverride_unlinked.c @@ -0,0 +1,248 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <sys/types.h> +#include <sys/shm.h> +#include <sys/types.h> +#include <sys/mman.h> +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <ctype.h> +#include <hugetlbfs.h> +#include "hugetests.h" + +/* + * Test Scenario: + * + * libhugetlbfs_shmoverride can be used to force shmget() to use the + * SHM_HUGETLB flag. This test ensures that the flag is correctly used + * based on the value of the environment variable. 
The assumption is + * made that the library is being preloaded. + */ + +extern int errno; + +/* Global test configuration */ +#define DYNAMIC_SYSCTL "/proc/sys/vm/nr_overcommit_hugepages" +static long saved_nr_hugepages = -1; +static long hpage_size, bpage_size; +static long oc_pool = -1; + +/* Required pool size for test */ +#define POOL_SIZE 4 + +/* State arrays for our mmaps */ +#define NR_SLOTS 1 +#define SL_TEST 0 +static int map_id[NR_SLOTS]; +static char *map_addr[NR_SLOTS]; +static size_t map_size[NR_SLOTS]; + +/* Only ia64 requires this */ +#ifdef __ia64__ +#define ADDR (void *)(0x8000000000000000UL) +#define SHMAT_FLAGS (SHM_RND) +#else +#define ADDR (void *)(0x0UL) +#define SHMAT_FLAGS (0) +#endif + +void _shmmap(int s, int hpages, int bpages, int line) +{ + map_size[s] = hpages * hpage_size + bpages * bpage_size; + map_id[s] = shmget(IPC_PRIVATE, map_size[s], IPC_CREAT | SHM_R | SHM_W); + if (map_id[s] < 0) + FAIL("shmget failed size %zd from line %d: %s", + map_size[s], line, strerror(errno)); + + map_addr[s] = shmat(map_id[s], ADDR, SHMAT_FLAGS); + if (map_addr[s] == (char *)-1) + FAIL("shmmat failed from line %d: %s", line, strerror(errno)); +} +#define shmmap(s, h, b) _shmmap(s, h, b, __LINE__) + +void _shmunmap(int s, int line) +{ + if (shmdt((const void *)map_addr[s]) != 0) { + FAIL("shmdt failed from line %d: %s", line, strerror(errno)); + return; + } + + if (shmctl(map_id[s], IPC_RMID, NULL) == -1) + FAIL("shmctl failed from line %d: %s", line, strerror(errno)); + + map_id[s] = -1; + map_addr[s] = NULL; + map_size[s] = 0; +} +#define shmunmap(s) _shmunmap(s, __LINE__) + +/* + * This test wants to manipulate the hugetlb pool without necessarily linking + * to libhugetlbfs so the helpers for doing this may not be available -- hence + * the duplicated versions below. 
+ * + * NOTE: We use /proc/sys/vm/nr_hugepages and /proc/meminfo for writing and + * reading pool counters because shared memory will always use the system + * default huge page size regardless of any libhugetlbfs settings. + */ +#define MEMINFO_SIZE 2048 +long local_read_meminfo(const char *tag) +{ + int fd; + char buf[MEMINFO_SIZE]; + int len, readerr; + char *p, *q; + long val; + + fd = open("/proc/meminfo", O_RDONLY); + if (fd < 0) + FAIL("Couldn't open /proc/meminfo: %s\n", strerror(errno)); + + len = read(fd, buf, sizeof(buf)); + readerr = errno; + close(fd); + if (len < 0) + FAIL("Error reading /proc/meminfo: %s\n", strerror(errno)); + + if (len == sizeof(buf)) + FAIL("/proc/meminfo is too large\n"); + buf[len] = '\0'; + + p = strstr(buf, tag); + if (!p) + FAIL("Tag %s not found in /proc/meminfo\n", tag); + p += strlen(tag); + + val = strtol(p, &q, 0); + if (!isspace(*q)) + FAIL("Couldn't parse /proc/meminfo\n"); + + return val; +} + +void setup_hugetlb_pool(unsigned long count) +{ + FILE *fd; + unsigned long poolsize; + count += local_read_meminfo("HugePages_Rsvd:"); + fd = fopen("/proc/sys/vm/nr_hugepages", "w"); + if (!fd) + CONFIG("Cannot open nr_hugepages for writing\n"); + fprintf(fd, "%lu", count); + fclose(fd); + + /* Confirm the resize worked */ + poolsize = local_read_meminfo("HugePages_Total:"); + if (poolsize != count) + FAIL("Failed to resize pool to %lu pages. 
Got %lu instead\n", + count, poolsize); +} + +void local_check_free_huge_pages(int needed_pages) +{ + int free = local_read_meminfo("HugePages_Free:"); + if (free < needed_pages) + CONFIG("Must have at least %i free hugepages", needed_pages); +} + +void run_test(char *desc, int hpages, int bpages, int pool_nr, int expect_diff) +{ + long resv_before, resv_after; + verbose_printf("%s...\n", desc); + setup_hugetlb_pool(pool_nr); + + /* untouched, shared mmap */ + resv_before = local_read_meminfo("HugePages_Rsvd:"); + shmmap(SL_TEST, hpages, bpages); + resv_after = local_read_meminfo("HugePages_Rsvd:"); + memset(map_addr[SL_TEST], 0, map_size[SL_TEST]); + shmunmap(SL_TEST); + + if (resv_after - resv_before != expect_diff) + FAIL("%s: Reserve page count did not adjust by %d page. " + "Expected %li reserved pages but got %li pages", + desc, expect_diff, + resv_before + expect_diff, resv_after); +} + +void cleanup(void) +{ + int i; + + /* Clean up any allocated shmids */ + for (i = 0; i < NR_SLOTS; i++) + if (map_id[i] > 0) + shmctl(map_id[i], IPC_RMID, NULL); + + /* Restore the pool size. */ + if (saved_nr_hugepages >= 0) + setup_hugetlb_pool(saved_nr_hugepages); + + if (oc_pool > 0) + restore_overcommit_pages(hpage_size, oc_pool); +} + +int main(int argc, char **argv) +{ + char *env; + + test_init(argc, argv); + check_must_be_root(); + local_check_free_huge_pages(POOL_SIZE); + saved_nr_hugepages = local_read_meminfo("HugePages_Total:"); + + /* + * We cannot call check_hugepagesize because we are not linked to + * libhugetlbfs. 
This is a bit hacky but we are depending on earlier + * tests failing to catch when this wouldn't work + */ + hpage_size = local_read_meminfo("Hugepagesize:") * 1024; + bpage_size = getpagesize(); + oc_pool = read_nr_overcommit(hpage_size); + if (oc_pool > 0) + set_nr_overcommit_hugepages(hpage_size, 0); + + env = getenv("HUGETLB_SHM"); + + /* Now that all env parsing is in one location and is only done once + * during library init, we cannot modify the value of HGUETLB_SHM + * in the middle of the test, instead run the tests that fit with + * the current value of HUGETLB_SHM + */ + if (env && strcasecmp(env, "yes") == 0) { + /* Run the test with large pages */ + run_test("override-requested-aligned", 1, 0, POOL_SIZE, 1); + + /* Run the test with large pages but with an unaligned size */ + run_test("override-requested-unaligned", 1, 1, POOL_SIZE, 2); + + /* Run the test with no pool but requested large pages */ + setup_hugetlb_pool(0); + run_test("override-requested-aligned-nopool", 1, 0, 0, 0); + } else { + /* Run the test with small pages */ + run_test("override-not-requested-aligned", 1, 0, POOL_SIZE, 0); + } + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/slbpacaflush.c b/default/libhugetlbfs/libhugetlbfs/tests/slbpacaflush.c new file mode 100644 index 0000000..8893c4d --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/slbpacaflush.c @@ -0,0 +1,96 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +/* Test rationale: + * + * ppc64 kernels (prior to 2.6.15-rc5) have a bug in the hugepage SLB + * flushing path. After opening new hugetlb areas, we update the + * masks in the thread_struct, copy to the PACA, then do slbies on + * each CPU. The trouble is we only copy to the PACA on the CPU where + * we're opening the segments, which can leave a stale copy in the + * PACAs on other CPUs. + * + * This can be triggered either with multiple threads sharing the mm, + * or with a single thread which is migrated from one CPU, to another + * (where the mapping occurs), then back again (where we touch the + * stale SLB). We use the second method in this test, since it's + * easier to force (using sched_setaffinity). However it relies on a + * close-to-idle system, if any process other than a kernel thread + * runs on the first CPU between runs of the test process, the SLB + * will be flushed and we won't trigger the bug, hence the + * PASS_INCONCLUSIVE(). Obviously, this test won't work on a 1-cpu + * system (should get CONFIG() on the sched_setaffinity()). 
+ */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sched.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + volatile unsigned long *q; + int err; + cpu_set_t cpu0, cpu1; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + CPU_ZERO(&cpu0); + CPU_SET(0, &cpu0); + CPU_ZERO(&cpu1); + CPU_SET(1, &cpu1); + + err = sched_setaffinity(getpid(), CPU_SETSIZE/8, &cpu0); + if (err != 0) + CONFIG("sched_setaffinity(cpu0): %s", strerror(errno)); + + err = sched_setaffinity(getpid(), CPU_SETSIZE/8, &cpu1); + if (err != 0) + CONFIG("sched_setaffinity(): %s", strerror(errno)); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + err = sched_setaffinity(getpid(), CPU_SETSIZE/8, &cpu0); + if (err != 0) + CONFIG("sched_setaffinity(cpu0): %s", strerror(errno)); + + q = (volatile unsigned long *)(p + getpagesize()); + *q = 0xdeadbeef; + + PASS_INCONCLUSIVE(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/stack_grow_into_huge.c b/default/libhugetlbfs/libhugetlbfs/tests/stack_grow_into_huge.c new file mode 100644 index 0000000..f2aff74 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/stack_grow_into_huge.c @@ -0,0 +1,140 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> +#include <sys/resource.h> +#include <sys/wait.h> + +#include <hugetlbfs.h> +#include "hugetests.h" + +/* + * Test rationale: + * + * On PowerPC, the address space is divided into segments. These segments can + * contain either huge pages or normal pages, but not both. All segments are + * initially set up to map normal pages. When a huge page mapping is created + * within a set of empty segments, they are "enabled" for huge pages at that + * time. Once enabled for huge pages, they can not be used again for normal + * pages for the remaining lifetime of the process. + * + * If the segment immediately preceeding the segment containing the stack is + * converted to huge pages and the stack is made to grow into the this + * preceeding segment, some kernels may attempt to map normal pages into the + * huge page-only segment -- resulting in bugs. + * + * The kernel bug in question was fixed by commit + * 0d59a01bc461bbab4017ff449b8401151ef44cf6. 
+ */ + +#ifdef __LP64__ +#define STACK_ALLOCATION_SIZE (256*1024*1024) +#else +#define STACK_ALLOCATION_SIZE (16*1024*1024) +#endif + +void do_child(void *stop_address) +{ + volatile int *x; + do { + x = alloca(STACK_ALLOCATION_SIZE); + *x = 1; + } while ((void *)x >= stop_address); +} + +int main(int argc, char *argv[]) +{ + int fd, pid, s, ret; + struct rlimit r; + char *b; + long hpage_size = gethugepagesize(); + void *stack_address, *mmap_address, *heap_address; + + test_init(argc, argv); + + ret = getrlimit(RLIMIT_STACK, &r); + if (ret) + CONFIG("getrlimit failed: %s", strerror(errno)); + + if (r.rlim_cur != RLIM_INFINITY) + CONFIG("Stack rlimit must be 'unlimited'"); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + CONFIG("Couldn't get hugepage fd"); + + stack_address = alloca(0); + heap_address = sbrk(0); + + /* + * paranoia: start mapping two hugepages below the start of the stack, + * in case the alignment would cause us to map over something if we + * only used a gap of one hugepage. + */ + mmap_address = PALIGN(stack_address - 2 * hpage_size, hpage_size); + + do { + b = mmap(mmap_address, hpage_size, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_SHARED, fd, 0); + mmap_address -= hpage_size; + /* + * if we get all the way down to the heap, stop trying + */ + if (mmap_address <= heap_address) + break; + } while (b == MAP_FAILED); + + if (b == MAP_FAILED) + FAIL("mmap: %s", strerror(errno)); + + if ((pid = fork()) < 0) + FAIL("fork: %s", strerror(errno)); + + if (pid == 0) { + do_child(mmap_address); + exit(0); + } + + ret = waitpid(pid, &s, 0); + if (ret == -1) + FAIL("waitpid: %s", strerror(errno)); + + /* + * The child grows its stack until a failure occurs. We expect + * this to result in a SIGSEGV. If any other signal is + * delivered (ie. SIGTRAP) or no signal is sent at all, we + * determine the kernel has not behaved correctly and trigger a + * test failure. 
+ */ + if (WIFSIGNALED(s)) { + int sig = WTERMSIG(s); + + if (sig == SIGSEGV) { + PASS(); + } else { + FAIL("Got unexpected signal: %s", strsignal(sig)); + } + } + FAIL("Child not signalled"); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/straddle_4GB.c b/default/libhugetlbfs/libhugetlbfs/tests/straddle_4GB.c new file mode 100644 index 0000000..da59fbf --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/straddle_4GB.c @@ -0,0 +1,108 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long straddle_addr; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + if (sizeof(void *) <= 4) + TEST_BUG("64-bit only"); + + if (hpage_size > FOURGB) + CONFIG("Huge page size too large"); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + straddle_addr = FOURGB - hpage_size; + + /* We first try to get the mapping without MAP_FIXED */ + verbose_printf("Mapping without MAP_FIXED at %lx...", straddle_addr); + p = mmap((void *)straddle_addr, 2*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED, fd, 0); + if (p == (void *)straddle_addr) { + /* These tests irrelevant if we didn't get the + * straddle address */ + verbose_printf("done\n"); + + if (test_addr_huge(p) != 1) + FAIL("Mapped address is not hugepage"); + + if (test_addr_huge(p + hpage_size) != 1) + FAIL("Mapped address is not hugepage"); + + verbose_printf("Clearing below 4GB..."); + memset(p, 0, hpage_size); + verbose_printf("done\n"); + + verbose_printf("Clearing above 4GB..."); + memset(p + hpage_size, 0, hpage_size); + verbose_printf("done\n"); + } else { + verbose_printf("got %p instead, never mind\n", p); + munmap(p, 2*hpage_size); + } + + verbose_printf("Mapping with MAP_FIXED at %lx...", straddle_addr); + p = mmap((void *)straddle_addr, 2*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap() FIXED: %s", strerror(errno)); + if (p != (void *)straddle_addr) { + verbose_printf("got %p instead\n", p); + FAIL("Wrong address with MAP_FIXED"); 
+ } + verbose_printf("done\n"); + + if (test_addr_huge(p) != 1) + FAIL("Mapped address is not hugepage"); + + if (test_addr_huge(p + hpage_size) != 1) + FAIL("Mapped address is not hugepage"); + + verbose_printf("Clearing below 4GB..."); + memset(p, 0, hpage_size); + verbose_printf("done\n"); + + verbose_printf("Clearing above 4GB..."); + memset(p + hpage_size, 0, hpage_size); + verbose_printf("done\n"); + + verbose_printf("Tested above 4GB\n"); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/task-size-overrun.c b/default/libhugetlbfs/libhugetlbfs/tests/task-size-overrun.c new file mode 100644 index 0000000..0140277 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/task-size-overrun.c @@ -0,0 +1,131 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LARGEFILE64_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <sys/mman.h> +#include <errno.h> +#include <assert.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define MAPS_BUF_SZ 4096 + +static unsigned long find_last_mapped(void) +{ + FILE *f; + char line[MAPS_BUF_SZ]; + char *tmp; + unsigned long start, end, off, ino; + int ret; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + ERROR("Failed to open /proc/self/maps: %s\n", strerror(errno)); + return -1; + } + + do { + tmp = fgets(line, MAPS_BUF_SZ, f); + } while (tmp); + fclose(f); + + verbose_printf("Last map: %s", line); + ret = sscanf(line, "%lx-%lx %*s %lx %*s %ld %*s", &start, &end, &off, &ino); + if (ret == EOF) + FAIL("Couldn't parse /proc/self/maps line: %s: %s\n", line, + strerror(errno)); + if (ret != 4) + FAIL("Couldn't parse /proc/self/maps line: %s\n", line); + + verbose_printf("Last map at 0x%lx-0x%lx\n", start, end); + return end; +} + +static unsigned long find_task_size(void) +{ + unsigned long addr; + void *p; + + addr = find_last_mapped(); + if (!addr || ((addr % getpagesize()) != 0)) + FAIL("Bogus stack end address, 0x%lx!?", addr); + + while (addr) { + p = mmap64((void *)addr, getpagesize(), PROT_READ, + MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0); + if (p == MAP_FAILED) { + verbose_printf("Searching map failed: %s\n", strerror(errno)); + return addr; + } + munmap(p, getpagesize()); + addr += getpagesize(); + } + /* addr wrapped around */ + return 0; +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + unsigned long task_size; + unsigned long straddle_addr; + + test_init(argc, argv); + + task_size = find_task_size(); + + verbose_printf("TASK_SIZE = 0x%lx\n", 
task_size); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + straddle_addr = task_size - hpage_size; + straddle_addr = ALIGN(straddle_addr, hpage_size); + + /* We first try to get the mapping without MAP_FIXED */ + verbose_printf("Mapping without MAP_FIXED at %lx...", straddle_addr); + errno = 0; + p = mmap((void *)straddle_addr, 2*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED, fd, 0); + verbose_printf("%s\n", strerror(errno)); + if (p == (void *)straddle_addr) + FAIL("Apparently suceeded in mapping across TASK_SIZE boundary"); + + verbose_printf("Mapping with MAP_FIXED at %lx...", straddle_addr); + errno = 0; + p = mmap((void *)straddle_addr, 2*hpage_size, PROT_READ|PROT_WRITE, + MAP_SHARED|MAP_FIXED, fd, 0); + verbose_printf("%s\n", strerror(errno)); + if (p != MAP_FAILED) + FAIL("Apparently suceeded in mapping across TASK_SIZE boundary"); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/test_root.c b/default/libhugetlbfs/libhugetlbfs/tests/test_root.c new file mode 100644 index 0000000..a6c842c --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/test_root.c @@ -0,0 +1,39 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include <stdio.h> +#include <stdlib.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + int val; + + test_init(argc, argv); + + val = hugetlbfs_test_path("/"); + + if (val) + FAIL("/ reports as hugetlbfs"); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/testutils.c b/default/libhugetlbfs/libhugetlbfs/tests/testutils.c new file mode 100644 index 0000000..68d8e62 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/testutils.c @@ -0,0 +1,298 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _LARGEFILE64_SOURCE +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <limits.h> +#include <string.h> +#include <errno.h> +#include <ctype.h> +#include <unistd.h> +#include <signal.h> +#include <sys/types.h> +#include <sys/vfs.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/stat.h> +#include <sys/mman.h> +#include <fcntl.h> + +#include "hugetlbfs.h" +#include "hugetests.h" + +#define HUGETLBFS_MAGIC 0x958458f6 +#define BUF_SZ 1024 +#define MEMINFO_SZ 2048 + +int verbose_test = 1; +char *test_name; + +void check_must_be_root(void) +{ + uid_t uid = getuid(); + if (uid != 0) + CONFIG("Must be root"); +} + +void check_hugetlb_shm_group(void) +{ + int fd; + ssize_t ret; + char gid_buffer[64] = {0}; + gid_t hugetlb_shm_group; + gid_t gid = getgid(); + uid_t uid = getuid(); + + /* root is an exception */ + if (uid == 0) + return; + + fd = open("/proc/sys/vm/hugetlb_shm_group", O_RDONLY); + if (fd < 0) + ERROR("Unable to open /proc/sys/vm/hugetlb_shm_group: %s", + strerror(errno)); + ret = read(fd, &gid_buffer, sizeof(gid_buffer)); + if (ret < 0) + ERROR("Unable to read /proc/sys/vm/hugetlb_shm_group: %s", + strerror(errno)); + hugetlb_shm_group = atoi(gid_buffer); + close(fd); + if (hugetlb_shm_group != gid) + CONFIG("Do not have permission to use SHM_HUGETLB"); +} + +void __attribute__((weak)) cleanup(void) +{ +} + +#if 0 +static void segv_handler(int signum, siginfo_t *si, void *uc) +{ + FAIL("Segmentation fault"); +} +#endif + +static void sigint_handler(int signum, siginfo_t *si, void *uc) +{ + cleanup(); + fprintf(stderr, "%s: %s (pid=%d)\n", test_name, + strsignal(signum), getpid()); + exit(RC_BUG); +} + +void test_init(int argc, char *argv[]) +{ + int err; + struct sigaction sa_int = { + 
.sa_sigaction = sigint_handler, + }; + + test_name = argv[0]; + + err = sigaction(SIGINT, &sa_int, NULL); + if (err) + FAIL("Can't install SIGINT handler: %s", strerror(errno)); + + if (getenv("QUIET_TEST")) + verbose_test = 0; + + verbose_printf("Starting testcase \"%s\", pid %d\n", + test_name, getpid()); +} + +#define MAPS_BUF_SZ 4096 + +static int read_maps(unsigned long addr, char *buf) +{ + FILE *f; + char line[MAPS_BUF_SZ]; + char *tmp; + + f = fopen("/proc/self/maps", "r"); + if (!f) { + ERROR("Failed to open /proc/self/maps: %s\n", strerror(errno)); + return -1; + } + + while (1) { + unsigned long start, end, off, ino; + int ret; + + tmp = fgets(line, MAPS_BUF_SZ, f); + if (!tmp) + break; + + buf[0] = '\0'; + ret = sscanf(line, "%lx-%lx %*s %lx %*s %ld %255s", + &start, &end, &off, &ino, + buf); + if ((ret < 4) || (ret > 5)) { + ERROR("Couldn't parse /proc/self/maps line: %s\n", + line); + fclose(f); + return -1; + } + + if ((start <= addr) && (addr < end)) { + fclose(f); + return 1; + } + } + + fclose(f); + return 0; +} + +/* + * With the inclusion of MAP_HUGETLB it is now possible to have huge pages + * without using hugetlbfs, so not all huge page regions will show with the + * test that reads /proc/self/maps. Instead we ask /proc/self/smaps for + * the KernelPageSize. 
On success we return the page size (in bytes) for the + * mapping that contains addr, on failure we return 0 + */ +unsigned long long get_mapping_page_size(void *p) +{ + FILE *f; + char line[MAPS_BUF_SZ]; + char *tmp; + unsigned long addr = (unsigned long)p; + + f = fopen("/proc/self/smaps", "r"); + if (!f) { + ERROR("Unable to open /proc/self/smaps\n"); + return 0; + } + + while ((tmp = fgets(line, MAPS_BUF_SZ, f))) { + unsigned long start, end, dummy; + char map_name[256]; + char buf[64]; + int ret; + + ret = sscanf(line, "%lx-%lx %s %lx %s %ld %s", &start, &end, + buf, &dummy, buf, &dummy, map_name); + if (ret < 7 || start > addr || end <= addr) + continue; + + while ((tmp = fgets(line, MAPS_BUF_SZ, f))) { + unsigned long long page_size; + + ret = sscanf(line, "KernelPageSize: %lld kB", + &page_size); + if (ret == 0 ) + continue; + if (ret < 1 || page_size <= 0) { + ERROR("Cannot parse /proc/self/smaps\n"); + page_size = 0; + } + + fclose(f); + /* page_size is reported in kB, we return B */ + return page_size * 1024; + } + } + + /* We couldn't find an entry for this addr in smaps */ + fclose(f); + return 0; +} + +/* We define this function standalone, rather than in terms of + * hugetlbfs_test_path() so that we can use it without -lhugetlbfs for + * testing PRELOAD */ +int test_addr_huge(void *p) +{ + char name[256]; + char *dirend; + int ret; + struct statfs64 sb; + + ret = read_maps((unsigned long)p, name); + if (ret < 0) + return ret; + if (ret == 0) { + verbose_printf("Couldn't find address %p in /proc/self/maps\n", + p); + return -1; + } + + /* looks like a filename? 
*/ + if (name[0] != '/') + return 0; + + /* Truncate the filename portion */ + + dirend = strrchr(name, '/'); + if (dirend && dirend > name) { + *dirend = '\0'; + } + + ret = statfs64(name, &sb); + if (ret) + return -1; + + return (sb.f_type == HUGETLBFS_MAGIC); +} + +ino_t get_addr_inode(void *p) +{ + char name[256]; + int ret; + struct stat sb; + + ret = read_maps((unsigned long)p, name); + if (ret < 0) + return ret; + if (ret == 0) { + ERROR("Couldn't find address %p in /proc/self/maps\n", p); + return -1; + } + + /* Don't care about non-filenames */ + if (name[0] != '/') + return 0; + + /* Truncate the filename portion */ + + ret = stat(name, &sb); + if (ret < 0) { + /* Don't care about unlinked files */ + if (errno == ENOENT) + return 0; + ERROR("stat failed: %s\n", strerror(errno)); + return -1; + } + + return sb.st_ino; +} + +int remove_shmid(int shmid) +{ + if (shmid >= 0) { + if (shmctl(shmid, IPC_RMID, NULL) != 0) { + ERROR("shmctl(%x, IPC_RMID) failed (%s)\n", + shmid, strerror(errno)); + return -1; + } + } + return 0; +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/truncate.c b/default/libhugetlbfs/libhugetlbfs/tests/truncate.c new file mode 100644 index 0000000..a45c8c4 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/truncate.c @@ -0,0 +1,79 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <signal.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +#define RANDOM_CONSTANT 0x1234ABCD + +static void sigbus_handler(int signum, siginfo_t *si, void *uc) +{ + PASS(); +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + volatile unsigned int *q; + int err; + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + q = p; + + /* Touch the memory */ + *q = 0; + + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("sigaction(): %s", strerror(errno)); + + + err = ftruncate(fd, 0); + if (err) + FAIL("ftruncate(): %s", strerror(errno)); + + *q; + + /* Should have SIGBUSed above */ + FAIL("Didn't SIGBUS"); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/truncate_above_4GB.c b/default/libhugetlbfs/libhugetlbfs/tests/truncate_above_4GB.c new file mode 100644 index 0000000..5b8c08f --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/truncate_above_4GB.c @@ -0,0 +1,157 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * Copyright (C) 2006 Hugh Dickins <hugh@xxxxxxxxxxx> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#define _LARGEFILE64_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <signal.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * At one stage, a misconversion of hugetlb_vmtruncate_list to a + * prio_tree meant that on 32-bit machines, truncates at or above 4GB + * could truncate lower pages, resulting in BUG_ON()s. + * + * WARNING: The offsets and addresses used within are specifically + * calculated to trigger the bug as it existed. Don't mess with them + * unless you *really* know what you're doing. + * + * The kernel bug in question was fixed with commit + * 856fc29505556cf263f3dcda2533cf3766c14ab6. 
+ */ +#define FOURGIG ((off64_t)0x100000000ULL) + +static void sigbus_handler_fail(int signum, siginfo_t *si, void *uc) +{ + FAIL("Unexpected SIGBUS"); +} + +static void sigbus_handler_pass(int signum, siginfo_t *si, void *uc) +{ + PASS(); +} + +int main(int argc, char *argv[]) +{ + int page_size; + long hpage_size; + long long buggy_offset, truncate_point; + int fd; + void *p, *q; + volatile unsigned int *pi, *qi; + int err; + struct sigaction sa_fail = { + .sa_sigaction = sigbus_handler_fail, + .sa_flags = SA_SIGINFO, + }; + struct sigaction sa_pass = { + .sa_sigaction = sigbus_handler_pass, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + page_size = getpagesize(); + hpage_size = check_hugepagesize(); + + check_free_huge_pages(3); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + truncate_point = FOURGIG; + buggy_offset = truncate_point / (hpage_size / page_size); + buggy_offset = ALIGN(buggy_offset, hpage_size); + + verbose_printf("Mapping 3 hpages at offset 0x%llx...", truncate_point); + /* First get arena of three hpages size, at file offset 4GB */ + q = mmap64(NULL, 3*hpage_size, PROT_READ|PROT_WRITE, + MAP_PRIVATE, fd, truncate_point); + if (q == MAP_FAILED) + FAIL("mmap() offset 4GB: %s", strerror(errno)); + verbose_printf("mapped at %p\n", q); + qi = q; + /* Touch the high page */ + *qi = 0; + + /* This part of the test makes the problem more obvious, but + * is not essential. It can't be done on powerpc, where + * segment restrictions prohibit us from performing such a + * mapping, so skip it there. Similarly, ia64's address space + * restrictions prevent this. 
*/ +#if !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__ia64__) + /* Replace middle hpage by tinypage mapping to trigger + * nr_ptes BUG */ + verbose_printf("Replacing map at %p-%p...", q + hpage_size, + q + hpage_size + hpage_size-1); + p = mmap64(q + hpage_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE|MAP_ANON, -1, 0); + if (p != q + hpage_size) + FAIL("mmap() before low hpage"); + verbose_printf("done\n"); + pi = p; + /* Touch one page to allocate its page table */ + *pi = 0; +#endif + + /* Replace top hpage by hpage mapping at confusing file offset */ + verbose_printf("Replacing map at %p with map from offset 0x%llx...", + q + 2*hpage_size, buggy_offset); + p = mmap64(q + 2*hpage_size, hpage_size, PROT_READ|PROT_WRITE, + MAP_FIXED|MAP_PRIVATE, fd, buggy_offset); + if (p != q + 2*hpage_size) + FAIL("mmap() buggy offset 0x%llx", buggy_offset); + verbose_printf("done\n"); + pi = p; + /* Touch the low page with something non-zero */ + *pi = 1; + + verbose_printf("Truncating at 0x%llx...", truncate_point); + err = ftruncate64(fd, truncate_point); + if (err) + FAIL("ftruncate(): %s", strerror(errno)); + verbose_printf("done\n"); + + err = sigaction(SIGBUS, &sa_fail, NULL); + if (err) + FAIL("sigaction() fail: %s", strerror(errno)); + + if (*pi != 1) + FAIL("Data 1 has changed to %u", *pi); + + err = sigaction(SIGBUS, &sa_pass, NULL); + if (err) + FAIL("sigaction() pass: %s", strerror(errno)); + + *qi; + + /* Should have SIGBUSed above */ + FAIL("Didn't SIGBUS on truncated page."); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/truncate_reserve_wraparound.c b/default/libhugetlbfs/libhugetlbfs/tests/truncate_reserve_wraparound.c new file mode 100644 index 0000000..0e27787 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/truncate_reserve_wraparound.c @@ -0,0 +1,130 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <unistd.h> +#include <signal.h> +#include <sys/mman.h> +#include <setjmp.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * At one stage, improper handling of tests against i_size could mess + * up accounting of reserved hugepages on certain truncate + * operations. + * + * This bug was fixed with a band-aid (enough to pass this test) in + * commit ebed4bfc8da8df5b6b0bc4a5064a949f04683509. A more complete + * fix still pending as of 3d4248885b9fca818e7fe6b66328e714876d36ad. 
+ */ + +#define RANDOM_CONSTANT 0x1234ABCD + +static sigjmp_buf sig_escape; + +static void sigbus_handler(int signum, siginfo_t *si, void *uc) +{ + siglongjmp(sig_escape, 17); +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + volatile unsigned int *q; + int err; + int sigbus_count = 0; + unsigned long initial_rsvd, rsvd; + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + initial_rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count before map: %lu\n", initial_rsvd); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + q = p; + + verbose_printf("Reserve count after map: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_RSVD)); + + *q = 0; + verbose_printf("Reserve count after touch: %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_RSVD)); + + err = ftruncate(fd, 0); + if (err) + FAIL("ftruncate(): %s", strerror(errno)); + + rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after truncate: %lu\n", rsvd); + if (rsvd != initial_rsvd) + FAIL("Reserved count is not restored after truncate: %lu instead of %lu", + rsvd, initial_rsvd); + + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("sigaction(): %s", strerror(errno)); + + if (sigsetjmp(sig_escape, 1) == 0) + *q; /* Fault, triggering a SIGBUS */ + else + sigbus_count++; + + if (sigbus_count != 1) + FAIL("Didn't SIGBUS after truncate"); + + rsvd = get_huge_page_counter(hpage_size, HUGEPAGES_RSVD); + verbose_printf("Reserve count after SIGBUS fault: %lu\n", rsvd); + if (rsvd != initial_rsvd) + FAIL("Reserved count is altered by SIGBUS fault: %lu instead of %lu", + rsvd, initial_rsvd); + + munmap(p, hpage_size); + + 
verbose_printf("Reserve count after munmap(): %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_RSVD)); + + close(fd); + + verbose_printf("Reserve count after close(): %lu\n", + get_huge_page_counter(hpage_size, HUGEPAGES_RSVD)); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/truncate_sigbus_versus_oom.c b/default/libhugetlbfs/libhugetlbfs/tests/truncate_sigbus_versus_oom.c new file mode 100644 index 0000000..7aa2fe5 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/truncate_sigbus_versus_oom.c @@ -0,0 +1,100 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <signal.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +/* + * Test rationale: + * + * Some kernel have a bug in the positioning of the test against + * i_size. This bug means that attempting to instantiate a page + * beyond the end of a hugepage file can result in an OOM and SIGKILL + * instead of the correct SIGBUS. + * + * This bug was fixed by commit ebed4bfc8da8df5b6b0bc4a5064a949f04683509. 
+ */ +static void sigbus_handler(int signum, siginfo_t *si, void *uc) +{ + PASS(); +} + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd, fdx; + unsigned long totpages; + void *p, *q; + int i; + int err; + struct sigaction sa = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, + }; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + totpages = get_huge_page_counter(hpage_size, HUGEPAGES_TOTAL); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + err = ftruncate(fd, 0); + if (err) + FAIL("ftruncate(): %s", strerror(errno)); + + /* Now slurp up all the available pages */ + fdx = hugetlbfs_unlinked_fd(); + if (fdx < 0) + FAIL("hugetlbfs_unlinked_fd() 2"); + + q = mmap(NULL, totpages * hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fdx, 0); + if (q == MAP_FAILED) + FAIL("mmap() reserving all pages: %s", strerror(errno)); + + /* Touch the pages to ensure they're removed from the pool */ + for (i = 0; i < totpages; i++) { + volatile char *x = (volatile char *)q + i*hpage_size; + *x = 0; + } + + /* SIGBUS is what *should* happen */ + err = sigaction(SIGBUS, &sa, NULL); + if (err) + FAIL("sigaction(): %s", strerror(errno)); + + *((volatile unsigned int *)p); + + /* Should have SIGBUSed above, or (failed the test) with SIGKILL */ + FAIL("Didn't SIGBUS or OOM"); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/unlinked_fd.c b/default/libhugetlbfs/libhugetlbfs/tests/unlinked_fd.c new file mode 100644 index 0000000..98bd4ee --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/unlinked_fd.c @@ -0,0 +1,60 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. 
+ * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <sys/mman.h> + +#include <hugetlbfs.h> + +#include "hugetests.h" + +int main(int argc, char *argv[]) +{ + long hpage_size; + int fd; + void *p; + int err; + + test_init(argc, argv); + + hpage_size = check_hugepagesize(); + + fd = hugetlbfs_unlinked_fd(); + if (fd < 0) + FAIL("hugetlbfs_unlinked_fd()"); + + p = mmap(NULL, hpage_size, PROT_READ|PROT_WRITE, MAP_SHARED, + fd, 0); + if (p == MAP_FAILED) + FAIL("mmap(): %s", strerror(errno)); + + err = test_addr_huge(p); + if (err != 1) + FAIL("Mapped address is not hugepage"); + + err = munmap(p, hpage_size); + if (err != 0) + FAIL("munmap(): %s", strerror(errno)); + + if (close(fd)) + FAIL("close(): %s", strerror(errno)); + + PASS(); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/wrapper-utils.sh b/default/libhugetlbfs/libhugetlbfs/tests/wrapper-utils.sh new file mode 100644 index 0000000..2f6451d --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/wrapper-utils.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +# Standard return codes +RC_PASS=0 +RC_CONFIG=1 +RC_FAIL=2 +RC_XFAIL=3 +RC_XPASS=4 +RC_BUG=99 + +function unexpected_pass() +{ + echo -n "UNEXPECTED " +} + +function expected_fail() +{ + echo -n 
"EXPECTED " +} + +# check_rc (<expected return code>, <actual return code>) +# Returns: Adjusted return code +# +# Check the actual and expected return codes to identify +# expected failures and unexpected passes. +function check_rc() +{ + EXP_RC=$1 + ACT_RC=$2 + + if [ $ACT_RC -eq $RC_PASS -a $EXP_RC -ne $RC_PASS ]; then + unexpected_pass + return $RC_XPASS + elif [ $EXP_RC -ne $RC_PASS -a $EXP_RC -eq $ACT_RC ]; then + expected_fail + return $RC_XFAIL + else + return $ACT_RC + fi +} + +# exec_and_check (<expected return code>, <command-line ...>) +# Does not return +# Execute a test command and check for expected failures and unexpected passes. +function exec_and_check() +{ + EXP_RC=$1 + shift + + OUTPUT=`$@` + check_rc $EXP_RC $? + RC=$? + echo $OUTPUT + + exit $RC +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/zero_filesize_segment.c b/default/libhugetlbfs/libhugetlbfs/tests/zero_filesize_segment.c new file mode 100644 index 0000000..22f52f1 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/zero_filesize_segment.c @@ -0,0 +1,60 @@ +/* + * libhugetlbfs - Easy use of Linux hugepages + * Copyright (C) 2005-2006 David Gibson & Adam Litke, IBM Corporation. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public License + * as published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#define _GNU_SOURCE + +#include <stdio.h> +#include <stdlib.h> +#include <link.h> + +#include "hugetests.h" + +static int parse_phdrs(struct dl_phdr_info *info, size_t size, void *data) +{ + int i; + /* This should only be iterated once - we assume that the + * first iteration is the phdrs for the main executable */ + + for (i = 0; i < info->dlpi_phnum; i++) { + const ElfW(Phdr) *phdr = &info->dlpi_phdr[i]; + + if (phdr->p_type != PT_LOAD) + continue; + + verbose_printf("PHDR %d: filesz = 0x%lx, memsz = 0x%lx\n", + i, (unsigned long)phdr->p_filesz, + (unsigned long)phdr->p_memsz); + if (phdr->p_filesz == 0) + PASS(); + } + + return 1; +} + +int main(int argc, char *argv[]) +{ + test_init(argc, argv); + + /* If we're even able to load, that's a good start, but lets + * verify that we really do have a segment with + * zero-filesize. */ + dl_iterate_phdr(parse_phdrs, NULL); + + FAIL("Couldn't find zero filesize segment (test misbuilt)"); +} diff --git a/default/libhugetlbfs/libhugetlbfs/tests/zero_filesize_segment.ld b/default/libhugetlbfs/libhugetlbfs/tests/zero_filesize_segment.ld new file mode 100644 index 0000000..7f2fe12 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/tests/zero_filesize_segment.ld @@ -0,0 +1,7 @@ +SECTIONS +{ + .empty (0x20000000) : { + __empty_segment = .; + . = . 
+ 4; + } +} diff --git a/default/libhugetlbfs/libhugetlbfs/version.c b/default/libhugetlbfs/libhugetlbfs/version.c new file mode 100644 index 0000000..0ab886a --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/version.c @@ -0,0 +1,3 @@ +#include "version.h" + +static const char libhugetlbfs_version[] = "VERSION: "VERSION; diff --git a/default/libhugetlbfs/libhugetlbfs/version.lds b/default/libhugetlbfs/libhugetlbfs/version.lds new file mode 100644 index 0000000..e76b8f7 --- /dev/null +++ b/default/libhugetlbfs/libhugetlbfs/version.lds @@ -0,0 +1,28 @@ +VERS_1.0 { + global: + gethugepagesize; + hugetlbfs_test_path; + hugetlbfs_find_path; + hugetlbfs_unlinked_fd; + local: + direct_syscall; + __lh_*; + __pu_*; +}; + +HTLBFS_2.0 { + global: + get_huge_pages; + free_huge_pages; +}; + +HTLBFS_2.1 { + global: + get_hugepage_region; + free_hugepage_region; + gethugepagesizes; + getpagesizes; + hugetlbfs_find_path_for_size; + hugetlbfs_unlinked_fd_for_size; + __tp_*; +}; diff --git a/default/libhugetlbfs/runtest.sh b/default/libhugetlbfs/runtest.sh new file mode 100755 index 0000000..2231145 --- /dev/null +++ b/default/libhugetlbfs/runtest.sh @@ -0,0 +1,49 @@ +#!/bin/sh + +source ../../utils/root-check.sh + +check_root +is_root=$? +if [ "$is_root" -ne "0" ]; then + exit 3 +fi + +# Build +cd libhugetlbfs +make BUILDTYPE=NATIVEONLY > /dev/null 2>/dev/null +if [ "$?" -ne "0" ]; then + echo "Could not build libhugetlbfs tests" + exit -1 +fi + +# Setup: Need at least 32 free hugepages for the shm test to run +obj/hugeadm --pool-pages-min 2MB:32 + +# Run +make BUILDTYPE=NATIVEONLY func > ../libhugetlbfs_results.tmp +if [ "$?" -ne "0" ]; then + echo "Could not run tests" + exit -1 +fi + +# look at stuff +# TODO +# Remove the readahead test. It seems it's known to always fail. +sed -e 's/readahead_reserve.sh .*//' ../libhugetlbfs_results.tmp > ../libhugetlbfs_results.tmp2 + +# Filter out the morecore tests too. 
Question sent into the list about those +sed -e 's/HUGETLB_MORECORE=.*//' ../libhugetlbfs_results.tmp2 > ../libhugetlbfs_results.txt + +grep -v "FAIL:" ../libhugetlbfs_results.txt | grep "FAIL" +if [ "$?" == "0" ]; then + echo "Test failures" + exit -1 +fi + +# Cleanup +obj/hugeadm --pool-pages-min 2MB:0 +obj/hugeadm --pool-pages-max 2MB:0 + +make clean > /dev/null 2> /dev/null +cd .. +rm -rf libhugetlbfs_results.* -- To stop receiving notification emails like this one, please contact the administrator of this repository. _______________________________________________ kernel mailing list kernel@xxxxxxxxxxxxxxxxxxxxxxx https://lists.fedoraproject.org/admin/lists/kernel@xxxxxxxxxxxxxxxxxxxxxxx