From: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> Tao Ma reported current mlock is much slower than old 2.6.18 kernel. Because lru_add_drain_all() spent much time. The problem are two. 1) lru_add_drain_all() broadcast a worker thread to all cpus unconditionally. then, the performance penalty is increased in proportion to number of cpus. 2) lru_add_drain_all() wait the worker finished unnecessary. It makes bigger penalty. This patch makes lru_add_drain_all_async() and changes mlock/mlockall use it. Technical side note: - has_pages_lru_pvecs() checks pagevecs locklessly. Of course, it's racy. But it's no matter because asynchronous worker itself is also racy. any lock can't close a race. - Now, we drain pagevec at last of mlock instead of beginning. because a page drain function (____pagevec_lru_add_fn) is PG_mlocked aware now. Then it's safe and it close more race. Without the patch: % time ./test_mlock -c 100000 real 1m13.608s user 0m0.204s sys 0m40.115s i.e. 200usec per mlock With the patch: % time ./test_mlock -c 100000 real 0m3.939s user 0m0.060s sys 0m3.868s i.e. 13usec per mlock test_mlock.c ========================================== #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <errno.h> #include <time.h> #include <sys/time.h> #include <sys/mman.h> #define MM_SZ1 24 #define MM_SZ2 56 #define MM_SZ3 4168 void mlock_test() { char ptr1[MM_SZ1]; char ptr2[MM_SZ2]; char ptr3[MM_SZ3]; if(0 != mlock(ptr1, MM_SZ1) ) perror("mlock MM_SZ1\n"); if(0 != mlock(ptr2, MM_SZ2) ) perror("mlock MM_SZ2\n"); if(0 != mlock(ptr3, MM_SZ3) ) perror("mlock MM_SZ3\n"); if(0 != munlock(ptr1, MM_SZ1) ) perror("munlock MM_SZ1\n"); if(0 != munlock(ptr2, MM_SZ2) ) perror("munlock MM_SZ2\n"); if(0 != munlock(ptr3, MM_SZ3) ) perror("munlock MM_SZ3\n"); } int main(int argc, char *argv[]) { int ret, opt; int i,cnt; while((opt = getopt(argc, argv, "c:")) != -1 ) { switch(opt){ case 'c': cnt = atoi(optarg); break; default: printf("Usage: %s [-c count] arg...\n", argv[0]); exit(EXIT_FAILURE); } } for(i = 0; i < cnt; i++) mlock_test(); return 0; } =========================================== Reported-by: Tao Ma <boyu.mt@xxxxxxxxxx> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> Cc: David Rientjes <rientjes@xxxxxxxxxx> Cc: Minchan Kim <minchan.kim@xxxxxxxxx> Cc: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx> Cc: Mel Gorman <mel@xxxxxxxxx> Cc: Johannes Weiner <jweiner@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/swap.h | 1 + mm/mlock.c | 7 +---- mm/swap.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 1e22e12..11ad301 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -223,6 +223,7 @@ extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern int lru_add_drain_all(void); +extern void lru_add_drain_all_async(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_page(struct page *page); extern void swap_setup(void); diff --git a/mm/mlock.c b/mm/mlock.c index 4f4f53b..08f5b6b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -487,8 +487,6 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) if (!can_do_mlock()) return -EPERM; - lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); start &= PAGE_MASK; @@ -505,6 +503,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) up_write(¤t->mm->mmap_sem); if (!error) error = do_mlock_pages(start, len, 0); + lru_add_drain_all_async(); return error; } @@ -557,9 +556,6 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!can_do_mlock()) goto out; - if (flags & MCL_CURRENT) - lru_add_drain_all(); /* flush pagevec */ - down_write(¤t->mm->mmap_sem); lock_limit = rlimit(RLIMIT_MEMLOCK); @@ -573,6 +569,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!ret && (flags & MCL_CURRENT)) { /* Ignore errors */ do_mlock_pages(0, TASK_SIZE, 1); + lru_add_drain_all_async(); } out: return ret; diff --git a/mm/swap.c b/mm/swap.c index a91caf7..2690f04 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -569,6 +569,49 @@ int lru_add_drain_all(void) return schedule_on_each_cpu(lru_add_drain_per_cpu); } +static DEFINE_PER_CPU(struct work_struct, lru_drain_work); + +static int __init lru_drain_work_init(void) +{ + struct work_struct *work; + int cpu; + + for_each_possible_cpu(cpu) { + work = &per_cpu(lru_drain_work, cpu); + INIT_WORK(work, &lru_add_drain_per_cpu); + } + + return 0; +} +core_initcall(lru_drain_work_init); + +static bool has_pages_lru_pvecs(int cpu) +{ + struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); + struct pagevec *pvec; + int lru; + + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + if (pagevec_count(pvec)) + return true; + } + + return false; +} + +void lru_add_drain_all_async(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + struct work_struct *work = &per_cpu(lru_drain_work, cpu); + + if (has_pages_lru_pvecs(cpu)) + schedule_work_on(cpu, work); + } +} + /* * Batched page_cache_release(). Decrement the reference count on all the * passed pages. If it fell to zero then remove the page from the LRU and @@ -704,10 +747,23 @@ static void ____pagevec_lru_add_fn(struct page *page, void *arg) VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); + redo: + if (page_evictable(page, NULL)) { + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); + } else { + SetPageUnevictable(page); + add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); + smp_mb(); + + if (page_evictable(page, NULL)) { + del_page_from_lru_list(zone, page, LRU_UNEVICTABLE); + ClearPageUnevictable(page); + goto redo; + } + } } /* -- 1.7.1 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>