* Ying Han <yinghan@xxxxxxxxxx> [2011-03-26 17:14:44]: > Two new stats in per-memcg memory.stat which tracks the number of > page faults and number of major page faults. > > "pgfault" > "pgmajfault" > > It is valuable to track the two stats for both measuring application's > performance as well as the efficiency of the kernel page reclaim path. > > Functional test: check the total number of pgfault/pgmajfault of all > memcgs and compare with global vmstat value: > > $ cat /proc/vmstat | grep fault > pgfault 1070751 > pgmajfault 553 > > $ cat /dev/cgroup/memory.stat | grep fault > pgfault 1069962 > pgmajfault 553 > total_pgfault 1069966 > total_pgmajfault 553 > > $ cat /dev/cgroup/A/memory.stat | grep fault > pgfault 199 > pgmajfault 0 > total_pgfault 199 > total_pgmajfault 0 > > Performance test: run page fault test(pft) with 16 threads on faulting in 15G > anon pages in 16G container. There is no regression noticed on the "flt/cpu/s" > > Sample output from pft: > TAG pft:anon-sys-default: > Gb Thr CLine User System Wall flt/cpu/s fault/wsec > 15 16 1 0.67s 232.11s 14.68s 16892.130 267796.518 > > $ ./ministat mmotm.txt mmotm_fault.txt > x mmotm.txt (w/o patch) > + mmotm_fault.txt (w/ patch) > +-------------------------------------------------------------------------+ > N Min Max Median Avg Stddev > x 10 16682.962 17344.027 16913.524 16928.812 166.5362 > + 10 16696.49 17480.09 16949.143 16951.448 223.56288 > No difference proven at 95.0% confidence > > Signed-off-by: Ying Han <yinghan@xxxxxxxxxx> > --- > Documentation/cgroups/memory.txt | 4 +++ > fs/ncpfs/mmap.c | 2 + > include/linux/memcontrol.h | 22 +++++++++++++++ > mm/filemap.c | 1 + > mm/memcontrol.c | 54 ++++++++++++++++++++++++++++++++++++++ > mm/memory.c | 2 + > mm/shmem.c | 1 + > 7 files changed, 86 insertions(+), 0 deletions(-) > > diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt > index b6ed61c..2db6103 100644 > --- a/Documentation/cgroups/memory.txt > +++ 
b/Documentation/cgroups/memory.txt > @@ -385,6 +385,8 @@ mapped_file - # of bytes of mapped file (includes tmpfs/shmem) > pgpgin - # of pages paged in (equivalent to # of charging events). > pgpgout - # of pages paged out (equivalent to # of uncharging events). > swap - # of bytes of swap usage > +pgfault - # of page faults. > +pgmajfault - # of major page faults. > inactive_anon - # of bytes of anonymous memory and swap cache memory on > LRU list. > active_anon - # of bytes of anonymous and swap cache memory on active > @@ -406,6 +408,8 @@ total_mapped_file - sum of all children's "cache" > total_pgpgin - sum of all children's "pgpgin" > total_pgpgout - sum of all children's "pgpgout" > total_swap - sum of all children's "swap" > +total_pgfault - sum of all children's "pgfault" > +total_pgmajfault - sum of all children's "pgmajfault" > total_inactive_anon - sum of all children's "inactive_anon" > total_active_anon - sum of all children's "active_anon" > total_inactive_file - sum of all children's "inactive_file" > diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c > index a7c07b4..adb3f45 100644 > --- a/fs/ncpfs/mmap.c > +++ b/fs/ncpfs/mmap.c > @@ -16,6 +16,7 @@ > #include <linux/mman.h> > #include <linux/string.h> > #include <linux/fcntl.h> > +#include <linux/memcontrol.h> > > #include <asm/uaccess.h> > #include <asm/system.h> > @@ -92,6 +93,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, > * -- wli > */ > count_vm_event(PGMAJFAULT); > + mem_cgroup_pgmajfault_from_mm(area->vm_mm); > return VM_FAULT_MAJOR; > } > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 5a5ce70..f771fc1 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -147,6 +147,11 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, > gfp_t gfp_mask); > u64 mem_cgroup_get_limit(struct mem_cgroup *mem); > > +void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val); > +void mem_cgroup_pgmajfault(struct 
mem_cgroup *memcg, int val); > +void mem_cgroup_pgfault_from_mm(struct mm_struct *mm); > +void mem_cgroup_pgmajfault_from_mm(struct mm_struct *mm); > + > #ifdef CONFIG_TRANSPARENT_HUGEPAGE > void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); > #endif > @@ -354,6 +359,23 @@ static inline void mem_cgroup_split_huge_fixup(struct page *head, > { > } > > +static inline void mem_cgroup_pgfault(struct mem_cgroup *memcg, > + int val) > +{ > +} > + > +static inline void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, > + int val) > +{ > +} > + > +static inline void mem_cgroup_pgfault_from_mm(struct mm_struct *mm) > +{ > +} > + > +static inline void mem_cgroup_pgmajfault_from_mm(struct mm_struct *mm) > +{ > +} > #endif /* CONFIG_CGROUP_MEM_CONT */ > > #if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) > diff --git a/mm/filemap.c b/mm/filemap.c > index a6cfecf..5dc5401 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -1683,6 +1683,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) > /* No page in the page cache at all */ > do_sync_mmap_readahead(vma, ra, file, offset); > count_vm_event(PGMAJFAULT); > + mem_cgroup_pgmajfault_from_mm(vma->vm_mm); > ret = VM_FAULT_MAJOR; > retry_find: > page = find_get_page(mapping, offset); > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 4407dd0..63d66f1 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -94,6 +94,8 @@ enum mem_cgroup_events_index { > MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ > MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ > MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */ > + MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ > + MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ > MEM_CGROUP_EVENTS_NSTATS, > }; > /* > @@ -585,6 +587,16 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, > this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); > } > > +void mem_cgroup_pgfault(struct 
mem_cgroup *mem, int val) > +{ > + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); > +} > + > +void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) > +{ > + this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); > +} > + > static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, > enum mem_cgroup_events_index idx) > { > @@ -813,6 +825,40 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) > return (mem == root_mem_cgroup); > } > > +void mem_cgroup_pgfault_from_mm(struct mm_struct *mm) > +{ > + struct mem_cgroup *mem; > + > + if (!mm) > + return; > + > + rcu_read_lock(); > + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); > + if (unlikely(!mem)) > + goto out; A lot of this can be reused. Just a minor nitpick: maybe you can combine this function and the one below. > + mem_cgroup_pgfault(mem, 1); > + > +out: > + rcu_read_unlock(); > +} > + > +void mem_cgroup_pgmajfault_from_mm(struct mm_struct *mm) > +{ > + struct mem_cgroup *mem; > + > + if (!mm) > + return; > + > + rcu_read_lock(); > + mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); > + if (unlikely(!mem)) > + goto out; > + mem_cgroup_pgmajfault(mem, 1); > +out: > + rcu_read_unlock(); > +} > +EXPORT_SYMBOL(mem_cgroup_pgmajfault_from_mm); > + > /* > * Following LRU functions are allowed to be used without PCG_LOCK. > * Operations are called by routine of global LRU independently from memcg. 
> @@ -3772,6 +3818,8 @@ enum { > MCS_PGPGIN, > MCS_PGPGOUT, > MCS_SWAP, > + MCS_PGFAULT, > + MCS_PGMAJFAULT, > MCS_INACTIVE_ANON, > MCS_ACTIVE_ANON, > MCS_INACTIVE_FILE, > @@ -3794,6 +3842,8 @@ struct { > {"pgpgin", "total_pgpgin"}, > {"pgpgout", "total_pgpgout"}, > {"swap", "total_swap"}, > + {"pgfault", "total_pgfault"}, > + {"pgmajfault", "total_pgmajfault"}, > {"inactive_anon", "total_inactive_anon"}, > {"active_anon", "total_active_anon"}, > {"inactive_file", "total_inactive_file"}, > @@ -3822,6 +3872,10 @@ mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) > val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); > s->stat[MCS_SWAP] += val * PAGE_SIZE; > } > + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); > + s->stat[MCS_PGFAULT] += val; > + val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); > + s->stat[MCS_PGMAJFAULT] += val; > > /* per zone stat */ > val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); > diff --git a/mm/memory.c b/mm/memory.c > index 8617d39..0f7ebc9 100644 > --- a/mm/memory.c > +++ b/mm/memory.c > @@ -2836,6 +2836,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, > /* Had to read the page from swap area: Major fault */ > ret = VM_FAULT_MAJOR; > count_vm_event(PGMAJFAULT); > + mem_cgroup_pgmajfault_from_mm(mm); > } else if (PageHWPoison(page)) { > /* > * hwpoisoned dirty swapcache pages are kept for killing > @@ -3375,6 +3376,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, > __set_current_state(TASK_RUNNING); > > count_vm_event(PGFAULT); > + mem_cgroup_pgfault_from_mm(mm); > > /* do counter updates before entering really critical section. 
*/ > check_sync_rss_stat(current); > diff --git a/mm/shmem.c b/mm/shmem.c > index ad8346b..5a82674 100644 > --- a/mm/shmem.c > +++ b/mm/shmem.c > @@ -1289,6 +1289,7 @@ repeat: > /* here we actually do the io */ > if (type && !(*type & VM_FAULT_MAJOR)) { > __count_vm_event(PGMAJFAULT); > + mem_cgroup_pgmajfault_from_mm(current->mm); > *type |= VM_FAULT_MAJOR; > } > spin_unlock(&info->lock); > -- > 1.7.3.1 > -- Three Cheers, Balbir -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxxx For more info on Linux MM, see: http://www.linux-mm.org/ . Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>