In jemalloc, free(3) doesn't immediately return memory to the OS even when the freed memory is page aligned and page sized, in the hope that the memory can be reused soon. Over time the virtual address space becomes fragmented and more and more free memory accumulates. Once enough free memory has built up, jemalloc uses madvise(MADV_DONTNEED) to actually release it back to the OS. The madvise calls have significant overhead, particularly because of TLB flushes, and jemalloc purges several virtual address ranges at a time. Instead of calling madvise once per range, introduce a new syscall, madvisev, which purges several ranges in a single call. This way the per-range TLB flushes can be merged into one big TLB flush, and mmap_sem locking is reduced as well.

I ran a simple memory allocation benchmark: 32 threads doing random malloc/free/realloc. The corresponding jemalloc patch utilizing this API is attached; a minimal userspace sketch of invoking the new syscall also follows the kernel patch below.

Without the patch:
real	0m18.923s
user	1m11.819s
sys	7m44.626s
Each CPU gets around 3000K/s TLB flush interrupts. perf shows TLB flushing among the hottest functions. mmap_sem read locking (because of page faults) is also heavy.

With the patch:
real	0m15.026s
user	0m48.548s
sys	6m41.153s
Each CPU gets around 140k/s TLB flush interrupts. TLB flushing isn't hot at all. mmap_sem read locking (still because of page faults) becomes the sole hot spot.

In another test, 48 threads each malloc a bunch of memory, then all threads free it; I measure the time of the free phase.
Without the patch: 34.332s
With the patch: 17.429s

Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: Andrea Arcangeli <aarcange@xxxxxxxxxx>
Signed-off-by: Shaohua Li <shli@xxxxxx>
---
 arch/x86/entry/syscalls/syscall_32.tbl |   1 +
 arch/x86/entry/syscalls/syscall_64.tbl |   1 +
 mm/madvise.c                           | 144 ++++++++++++++++++++++++++++++---
 3 files changed, 134 insertions(+), 12 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 7663c45..4c99ef5 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -382,3 +382,4 @@
 373	i386	shutdown		sys_shutdown
 374	i386	userfaultfd		sys_userfaultfd
 375	i386	membarrier		sys_membarrier
+376	i386	madvisev		sys_madvisev
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 278842f..1025406 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -331,6 +331,7 @@
 322	64	execveat		stub_execveat
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
+325	common	madvisev		sys_madvisev
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/mm/madvise.c b/mm/madvise.c
index c889fcb..6251103 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -20,6 +20,9 @@
 #include <linux/backing-dev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
+#include <linux/uio.h>
+#include <linux/sort.h>
+#include <asm/tlb.h>
 
 /*
  * Any behaviour which results in changes to the vma->vm_flags needs to
@@ -415,6 +418,29 @@ madvise_behavior_valid(int behavior)
 	}
 }
 
+static bool madvise_range_valid(unsigned long start, size_t len_in, bool *skip)
+{
+	size_t len;
+	unsigned long end;
+
+	if (start & ~PAGE_MASK)
+		return false;
+	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+	/* Check to see whether len was rounded up from small -ve to zero */
+	if (len_in && !len)
+		return false;
+
+	end = start + len;
+	if (end < start)
+		return false;
+
+	if (end == start)
+		*skip = true;
+	else
+		*skip = false;
+	return true;
+}
+
 /*
  * The madvise(2) system call.
  *
@@ -464,8 +490,8 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	int unmapped_error = 0;
 	int error = -EINVAL;
 	int write;
-	size_t len;
 	struct blk_plug plug;
+	bool skip;
 
 #ifdef CONFIG_MEMORY_FAILURE
 	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@@ -474,20 +500,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
-	if (start & ~PAGE_MASK)
-		return error;
-	len = (len_in + ~PAGE_MASK) & PAGE_MASK;
-
-	/* Check to see whether len was rounded up from small -ve to zero */
-	if (len_in && !len)
-		return error;
-
-	end = start + len;
-	if (end < start)
+	if (!madvise_range_valid(start, len_in, &skip))
 		return error;
+	end = start + ((len_in + ~PAGE_MASK) & PAGE_MASK);
 
 	error = 0;
-	if (end == start)
+	if (skip)
 		return error;
 
 	write = madvise_need_mmap_write(behavior);
@@ -549,3 +567,105 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 
 	return error;
 }
+
+static int iov_cmp_func(const void *a, const void *b)
+{
+	const struct iovec *iova = a;
+	const struct iovec *iovb = b;
+	unsigned long addr_a = (unsigned long)iova->iov_base;
+	unsigned long addr_b = (unsigned long)iovb->iov_base;
+
+	if (addr_a > addr_b)
+		return 1;
+	if (addr_a < addr_b)
+		return -1;
+	return 0;
+}
+
+SYSCALL_DEFINE3(madvisev, const struct iovec __user *, uvector, unsigned long, nr_segs,
+	int, behavior)
+{
+	struct iovec iovstack[UIO_FASTIOV];
+	struct iovec *iov = NULL;
+	struct vm_area_struct **vmas = NULL;
+	unsigned long start, last_start = 0;
+	size_t len;
+	struct mmu_gather tlb;
+	int error;
+	int i;
+	bool skip;
+
+	if (behavior != MADV_DONTNEED)
+		return -EINVAL;
+
+	error = rw_copy_check_uvector(CHECK_IOVEC_ONLY, uvector, nr_segs,
+		UIO_FASTIOV, iovstack, &iov);
+	if (error <= 0)
+		return error;
+	/* Make sure addresses are in ascending order */
+	sort(iov, nr_segs, sizeof(struct iovec), iov_cmp_func, NULL);
+
+	vmas = kmalloc(nr_segs * sizeof(struct vm_area_struct *), GFP_KERNEL);
+	if (!vmas) {
+		error = -EFAULT;
+		goto out;
+	}
+	for (i = 0; i < nr_segs; i++) {
+		start = (unsigned long)iov[i].iov_base;
+		len = ((iov[i].iov_len + ~PAGE_MASK) & PAGE_MASK);
+		iov[i].iov_len = len;
+		if (start < last_start) {
+			error = -EINVAL;
+			goto out;
+		}
+		if (!madvise_range_valid(start, len, &skip)) {
+			error = -EINVAL;
+			goto out;
+		}
+		if (skip) {
+			error = 0;
+			goto out;
+		}
+		last_start = start + len;
+	}
+
+	down_read(&current->mm->mmap_sem);
+	for (i = 0; i < nr_segs; i++) {
+		start = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+		vmas[i] = find_vma(current->mm, start);
+		/*
+		 * don't allow a range to cross VMAs; it doesn't make sense
+		 * for DONTNEED
+		 */
+		if (!vmas[i] || start < vmas[i]->vm_start ||
+		    start + len > vmas[i]->vm_end) {
+			error = -ENOMEM;
+			goto up_out;
+		}
+		if (vmas[i]->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) {
+			error = -EINVAL;
+			goto up_out;
+		}
+	}
+
+	lru_add_drain();
+	tlb_gather_mmu(&tlb, current->mm, (unsigned long)iov[0].iov_base,
+		last_start);
+	update_hiwater_rss(current->mm);
+	for (i = 0; i < nr_segs; i++) {
+		start = (unsigned long)iov[i].iov_base;
+		len = iov[i].iov_len;
+		unmap_vmas(&tlb, vmas[i], start, start + len);
+	}
+	tlb_finish_mmu(&tlb, (unsigned long)iov[0].iov_base, last_start);
+	error = 0;
+
+up_out:
+	up_read(&current->mm->mmap_sem);
+out:
+	kfree(vmas);
+	if (iov != iovstack)
+		kfree(iov);
+	return error;
+}
-- 
2.4.6
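
For reference, here is the minimal userspace sketch mentioned above showing how the new syscall could be driven. This is illustration only, not part of the patch: there is no glibc wrapper, so it goes through syscall(2); __NR_madvisev and the purge_ranges() helper are my own names, and 325 is simply the x86-64 table entry added above.

#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef __NR_madvisev
#define __NR_madvisev 325	/* x86-64 number from syscall_64.tbl above */
#endif

/* Purge several page-aligned ranges with (ideally) a single TLB flush. */
static int purge_ranges(struct iovec *iov, unsigned long nr_segs)
{
	return syscall(__NR_madvisev, iov, nr_segs, MADV_DONTNEED);
}

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *buf = mmap(NULL, 64 * page, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct iovec iov[2];

	if (buf == MAP_FAILED)
		return 1;
	memset(buf, 1, 64 * page);	/* fault the pages in */

	/* Two discontiguous, page-aligned ranges inside one VMA. */
	iov[0].iov_base = buf;
	iov[0].iov_len  = 4 * page;
	iov[1].iov_base = buf + 32 * page;
	iov[1].iov_len  = 8 * page;

	if (purge_ranges(iov, 2))
		fprintf(stderr, "madvisev: %s\n", strerror(errno));

	munmap(buf, 64 * page);
	return 0;
}

The attached jemalloc patch follows.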
diff --git a/src/arena.c b/src/arena.c
index 43733cc..ae2de35 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -1266,6 +1266,7 @@ arena_dirty_count(arena_t *arena)
 	return (ndirty);
 }
 
+#define PURGE_VEC 1
 static size_t
 arena_compute_npurge(arena_t *arena, bool all)
 {
@@ -1280,6 +1281,10 @@ arena_compute_npurge(arena_t *arena, bool all)
 		threshold = threshold < chunk_npages ? chunk_npages : threshold;
 
 		npurge = arena->ndirty - threshold;
+#if PURGE_VEC
+		if (npurge < arena->ndirty / 2)
+			npurge = arena->ndirty / 2;
+#endif
 	} else
 		npurge = arena->ndirty;
 
@@ -1366,6 +1371,16 @@ arena_stash_dirty(arena_t *arena, chunk_hooks_t *chunk_hooks, bool all,
 	return (nstashed);
 }
 
+#if PURGE_VEC
+#define MAX_IOVEC 32
+bool pages_purge_vec(struct iovec *iov, unsigned long nr_segs)
+{
+	int ret = syscall(325, iov, nr_segs, MADV_DONTNEED);
+
+	return !!ret;
+}
+#endif
+
 static size_t
 arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks,
     arena_runs_dirty_link_t *purge_runs_sentinel,
@@ -1374,6 +1389,10 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks,
 	size_t npurged, nmadvise;
 	arena_runs_dirty_link_t *rdelm;
 	extent_node_t *chunkselm;
+#if PURGE_VEC
+	struct iovec iovec[MAX_IOVEC];
+	int vec_index = 0;
+#endif
 
 	if (config_stats)
 		nmadvise = 0;
@@ -1418,9 +1437,21 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks,
 				flag_unzeroed = 0;
 				flags = CHUNK_MAP_DECOMMITTED;
 			} else {
+#if !PURGE_VEC
 				flag_unzeroed = chunk_purge_wrapper(arena,
 				    chunk_hooks, chunk, chunksize, pageind <<
 				    LG_PAGE, run_size) ? CHUNK_MAP_UNZEROED : 0;
+#else
+				flag_unzeroed = 0;
+				iovec[vec_index].iov_base = (void *)((uintptr_t)chunk +
+					(pageind << LG_PAGE));
+				iovec[vec_index].iov_len = run_size;
+				vec_index++;
+				if (vec_index >= MAX_IOVEC) {
+					pages_purge_vec(iovec, vec_index);
+					vec_index = 0;
+				}
+#endif
 				flags = flag_unzeroed;
 			}
 			arena_mapbits_large_set(chunk, pageind+npages-1, 0,
@@ -1449,6 +1480,10 @@ arena_purge_stashed(arena_t *arena, chunk_hooks_t *chunk_hooks,
 		if (config_stats)
 			nmadvise++;
 	}
+#if PURGE_VEC
+	if (vec_index > 0)
+		pages_purge_vec(iovec, vec_index);
+#endif
 	malloc_mutex_lock(&arena->lock);
 	if (config_stats) {
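
Not part of the attached jemalloc patch, but a deployable allocator change would presumably want a runtime fallback for kernels without the new syscall. A rough sketch follows; the pages_purge_vec_fallback() name and the cached ENOSYS probe are my assumptions, and the per-range madvise loop is simply today's behaviour.

#define _GNU_SOURCE
#include <errno.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

#ifndef __NR_madvisev
#define __NR_madvisev 325	/* x86-64 number from the kernel patch */
#endif

/*
 * Try the batched purge first; on ENOSYS remember that the kernel is
 * too old and fall back to one madvise(MADV_DONTNEED) per range.
 * Returns true on error, matching pages_purge_vec() in the patch.
 * (The static flag is not strictly thread-safe; worst case is a few
 * extra ENOSYS syscalls, which is harmless for a sketch.)
 */
static bool
pages_purge_vec_fallback(struct iovec *iov, unsigned long nr_segs)
{
	static bool have_madvisev = true;	/* cleared on first ENOSYS */
	unsigned long i;
	bool err = false;

	if (have_madvisev) {
		if (syscall(__NR_madvisev, iov, nr_segs, MADV_DONTNEED) == 0)
			return (false);
		if (errno != ENOSYS)
			return (true);
		have_madvisev = false;
	}

	/* Old kernel: purge each range separately, one TLB flush per call. */
	for (i = 0; i < nr_segs; i++) {
		if (madvise(iov[i].iov_base, iov[i].iov_len, MADV_DONTNEED) != 0)
			err = true;
	}
	return (err);
}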