On Tue, Mar 29, 2011 at 8:30 AM, Balbir Singh <balbir@xxxxxxxxxxxxxxxxxx> wrote: > * Ying Han <yinghan@xxxxxxxxxx> [2011-03-26 17:14:44]: > >> Two new stats in per-memcg memory.stat which track the number of >> page faults and number of major page faults. >> >> "pgfault" >> "pgmajfault" >> >> It is valuable to track the two stats for both measuring application's >> performance as well as the efficiency of the kernel page reclaim path. >> >> Functional test: check the total number of pgfault/pgmajfault of all >> memcgs and compare with global vmstat value: >> >> $ cat /proc/vmstat | grep fault >> pgfault 1070751 >> pgmajfault 553 >> >> $ cat /dev/cgroup/memory.stat | grep fault >> pgfault 1069962 >> pgmajfault 553 >> total_pgfault 1069966 >> total_pgmajfault 553 >> >> $ cat /dev/cgroup/A/memory.stat | grep fault >> pgfault 199 >> pgmajfault 0 >> total_pgfault 199 >> total_pgmajfault 0 >> >> Performance test: run page fault test (pft) with 16 threads faulting in 15G >> anon pages in a 16G container. 
There is no regression noticed on the "flt/cpu/s" >> >> Sample output from pft: >> TAG pft:anon-sys-default: >> Gb Thr CLine User System Wall flt/cpu/s fault/wsec >> 15 16 1 0.67s 232.11s 14.68s 16892.130 267796.518 >> >> $ ./ministat mmotm.txt mmotm_fault.txt >> x mmotm.txt (w/o patch) >> + mmotm_fault.txt (w/ patch) >> +-------------------------------------------------------------------------+ >> N Min Max Median Avg Stddev >> x 10 16682.962 17344.027 16913.524 16928.812 166.5362 >> + 10 16696.49 17480.09 16949.143 16951.448 223.56288 >> No difference proven at 95.0% confidence >> >> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx> >> --- >> Documentation/cgroups/memory.txt | 4 +++ >> fs/ncpfs/mmap.c | 2 + >> include/linux/memcontrol.h | 22 +++++++++++++++ >> mm/filemap.c | 1 + >> mm/memcontrol.c | 54 ++++++++++++++++++++++++++++++++++++++ >> mm/memory.c | 2 + >> mm/shmem.c | 1 + >> 7 files changed, 86 insertions(+), 0 deletions(-) >> >> diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt >> index b6ed61c..2db6103 100644 >> --- a/Documentation/cgroups/memory.txt >> +++ b/Documentation/cgroups/memory.txt >> @@ -385,6 +385,8 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem) >> pgpgin - # of pages paged in (equivalent to # of charging events). >> pgpgout - # of pages paged out (equivalent to # of uncharging events). >> swap - # of bytes of swap usage >> +pgfault - # of page faults. >> +pgmajfault - # of major page faults. >> inactive_anon - # of bytes of anonymous memory and swap cache memory on >> LRU list. 
>> active_anon - # of bytes of anonymous and swap cache memory on active >> @@ -406,6 +408,8 @@ total_mapped_file - sum of all children's "cache" >> total_pgpgin - sum of all children's "pgpgin" >> total_pgpgout - sum of all children's "pgpgout" >> total_swap - sum of all children's "swap" >> +total_pgfault - sum of all children's "pgfault" >> +total_pgmajfault - sum of all children's "pgmajfault" >> total_inactive_anon - sum of all children's "inactive_anon" >> total_active_anon - sum of all children's "active_anon" >> total_inactive_file - sum of all children's "inactive_file" >> diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c >> index a7c07b4..adb3f45 100644 >> --- a/fs/ncpfs/mmap.c >> +++ b/fs/ncpfs/mmap.c >> @@ -16,6 +16,7 @@ >> #include <linux/mman.h> >> #include <linux/string.h> >> #include <linux/fcntl.h> >> +#include <linux/memcontrol.h> >> >> #include <asm/uaccess.h> >> #include <asm/system.h> >> @@ -92,6 +93,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, >> * -- wli >> */ >> count_vm_event(PGMAJFAULT); >> + mem_cgroup_pgmajfault_from_mm(area->vm_mm); >> return VM_FAULT_MAJOR; >> } >> >> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h >> index 5a5ce70..f771fc1 100644 >> --- a/include/linux/memcontrol.h >> +++ b/include/linux/memcontrol.h >> @@ -147,6 +147,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, >> gfp_t gfp_mask); >> u64 mem_cgroup_get_limit(struct mem_cgroup *mem); >> >> +void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val); >> +void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val); >> +void mem_cgroup_pgfault_from_mm(struct mm_struct *mm); >> +void mem_cgroup_pgmajfault_from_mm(struct mm_struct *mm); >> + >> #ifdef CONFIG_TRANSPARENT_HUGEPAGE >> void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); >> #endif >> @@ -354,6 +359,23 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head, >> { >> } >> >> +static inline void 
mem_cgroup_pgfault(struct mem_cgroup *memcg, >> + int val) >> +{ >> +} >> + >> +static inline void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, >> + int val) >> +{ >> +} >> + >> +static inline void mem_cgroup_pgfault_from_mm(struct mm_struct *mm) >> +{ >> +} >> + >> +static inline void mem_cgroup_pgmajfault_from_mm(struct mm_struct *mm) >> +{ >> +} >> #endif /* CONFIG_CGROUP_MEM_CONT */ >> >> #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) >> diff --git a/mm/filemap.c b/mm/filemap.c >> index a6cfecf..5dc5401 100644 >> --- a/mm/filemap.c >> +++ b/mm/filemap.c >> @@ -1683,6 +1683,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) >> /* No page in the page cache at all */ >> do_sync_mmap_readahead(vma, ra, file, offset); >> count_vm_event(PGMAJFAULT); >> + mem_cgroup_pgmajfault_from_mm(vma->vm_mm); >> ret = VM_FAULT_MAJOR; >> retry_find: >> page = find_get_page(mapping, offset); >> diff --git a/mm/memcontrol.c b/mm/memcontrol.c >> index 4407dd0..63d66f1 100644 >> --- a/mm/memcontrol.c >> +++ b/mm/memcontrol.c >> @@ -94,6 +94,8 @@ enum mem_cgroup_events_index { >> MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ >> MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ >> MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ >> + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ >> + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ >> MEM_CGROUP_EVENTS_NSTATS, >> }; >> /* >> @@ -585,6 +587,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, >> this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); >> } >> >> +void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) >> +{ >> + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); >> +} >> + >> +void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) >> +{ >> + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); >> +} >> + >> static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, >> enum 
mem_cgroup_events_index idx) >> { >> @@ -813,6 +825,40 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) >> return (mem == root_mem_cgroup); >> } >> >> +void mem_cgroup_pgfault_from_mm(struct mm_struct *mm) >> +{ >> + struct mem_cgroup *mem; >> + >> + if (!mm) >> + return; >> + >> + rcu_read_lock(); >> + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); >> + if (unlikely(!mem)) >> + goto out; > > A lot of this can be reused, just a minor nitpick. Maybe you can > combine this function and the one below. This has been fixed in V3 :) --Ying > >> + mem_cgroup_pgfault(mem, 1); >> + >> +out: >> + rcu_read_unlock(); >> +} >> + >> +void mem_cgroup_pgmajfault_from_mm(struct mm_struct *mm) >> +{ >> + struct mem_cgroup *mem; >> + >> + if (!mm) >> + return; >> + >> + rcu_read_lock(); >> + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); >> + if (unlikely(!mem)) >> + goto out; >> + mem_cgroup_pgmajfault(mem, 1); >> +out: >> + rcu_read_unlock(); >> +} >> +EXPORT_SYMBOL(mem_cgroup_pgmajfault_from_mm); >> + >> /* >> * Following LRU functions are allowed to be used without PCG_LOCK. >> * Operations are called by routine of global LRU independently from memcg. 
>> @@ -3772,6 +3818,8 @@ enum { >> MCS_PGPGIN, >> MCS_PGPGOUT, >> MCS_SWAP, >> + MCS_PGFAULT, >> + MCS_PGMAJFAULT, >> MCS_INACTIVE_ANON, >> MCS_ACTIVE_ANON, >> MCS_INACTIVE_FILE, >> @@ -3794,6 +3842,8 @@ struct { >> {"pgpgin", "total_pgpgin"}, >> {"pgpgout", "total_pgpgout"}, >> {"swap", "total_swap"}, >> + {"pgfault", "total_pgfault"}, >> + {"pgmajfault", "total_pgmajfault"}, >> {"inactive_anon", "total_inactive_anon"}, >> {"active_anon", "total_active_anon"}, >> {"inactive_file", "total_inactive_file"}, >> @@ -3822,6 +3872,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) >> val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); >> s->stat[MCS_SWAP] += val * PAGE_SIZE; >> } >> + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); >> + s->stat[MCS_PGFAULT] += val; >> + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); >> + s->stat[MCS_PGMAJFAULT] += val; >> >> /* per zone stat */ >> val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); >> diff --git a/mm/memory.c b/mm/memory.c >> index 8617d39..0f7ebc9 100644 >> --- a/mm/memory.c >> +++ b/mm/memory.c >> @@ -2836,6 +2836,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, >> /* Had to read the page from swap area: Major fault */ >> ret = VM_FAULT_MAJOR; >> count_vm_event(PGMAJFAULT); >> + mem_cgroup_pgmajfault_from_mm(mm); >> } else if (PageHWPoison(page)) { >> /* >> * hwpoisoned dirty swapcache pages are kept for killing >> @@ -3375,6 +3376,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, >> __set_current_state(TASK_RUNNING); >> >> count_vm_event(PGFAULT); >> + mem_cgroup_pgfault_from_mm(mm); >> >> /* do counter updates before entering really critical section. 
*/ >> check_sync_rss_stat(current); >> diff --git a/mm/shmem.c b/mm/shmem.c >> index ad8346b..5a82674 100644 >> --- a/mm/shmem.c >> +++ b/mm/shmem.c >> @@ -1289,6 +1289,7 @@ repeat: >> /* here we actually do the io */ >> if (type && !(*type & VM_FAULT_MAJOR)) { >> __count_vm_event(PGMAJFAULT); >> + mem_cgroup_pgmajfault_from_mm(current->mm); >> *type |= VM_FAULT_MAJOR; >> } >> spin_unlock(&info->lock); >> -- >> 1.7.3.1 >> > > -- > Three Cheers, > Balbir > -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href