Andrea Righi wrote: > Dirty pages in the page cache can be processed asynchronously by kernel > threads (pdflush) using a writeback policy. For this reason the real > writes to the underlying block devices occur in a different IO context > respect to the task that originally generated the dirty pages involved > in the IO operation. This makes the tracking and throttling of writeback > IO more complicate respect to the synchronous IO. > > The page_cgroup infrastructure, currently available only for the memory > cgroup controller, can be used to store the owner of each page and > opportunely track the writeback IO. This information is encoded in > page_cgroup->flags. You encode the id in page_cgroup->flags; if a cgroup gets removed, IMHO, you should remove the corresponding id from the flags. One more thing: if a task moves from one cgroup to another, the id in the flags also needs to be changed. > > A owner can be identified using a generic ID number and the following > interfaces are provided to store a retrieve this information: > > unsigned long page_cgroup_get_owner(struct page *page); > int page_cgroup_set_owner(struct page *page, unsigned long id); > int page_cgroup_copy_owner(struct page *npage, struct page *opage); > > The io-throttle controller uses the cgroup css_id() as the owner's ID > number. > > A big part of this code is taken from the Ryo and Hirokazu's bio-cgroup > controller (http://people.valinux.co.jp/~ryov/bio-cgroup/). 
> > Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx> > Signed-off-by: Hirokazu Takahashi <taka@xxxxxxxxxxxxx> > Signed-off-by: Ryo Tsuruta <ryov@xxxxxxxxxxxxx> > --- > include/linux/memcontrol.h | 6 +++ > include/linux/mmzone.h | 4 +- > include/linux/page_cgroup.h | 33 +++++++++++++- > init/Kconfig | 4 ++ > mm/Makefile | 3 +- > mm/memcontrol.c | 6 +++ > mm/page_cgroup.c | 95 ++++++++++++++++++++++++++++++++++++++----- > 7 files changed, 135 insertions(+), 16 deletions(-) > > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h > index 18146c9..f3e0e64 100644 > --- a/include/linux/memcontrol.h > +++ b/include/linux/memcontrol.h > @@ -37,6 +37,8 @@ struct mm_struct; > * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) > */ > > +extern void __init_mem_page_cgroup(struct page_cgroup *pc); > + > extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm, > gfp_t gfp_mask); > /* for swap handling */ > @@ -120,6 +122,10 @@ extern bool mem_cgroup_oom_called(struct task_struct *task); > #else /* CONFIG_CGROUP_MEM_RES_CTLR */ > struct mem_cgroup; > > +static inline void __init_mem_page_cgroup(struct page_cgroup *pc) > +{ > +} > + > static inline int mem_cgroup_newpage_charge(struct page *page, > struct mm_struct *mm, gfp_t gfp_mask) > { > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h > index 186ec6a..b178eb9 100644 > --- a/include/linux/mmzone.h > +++ b/include/linux/mmzone.h > @@ -607,7 +607,7 @@ typedef struct pglist_data { > int nr_zones; > #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ > struct page *node_mem_map; > -#ifdef CONFIG_CGROUP_MEM_RES_CTLR > +#ifdef CONFIG_PAGE_TRACKING > struct page_cgroup *node_page_cgroup; > #endif > #endif > @@ -958,7 +958,7 @@ struct mem_section { > > /* See declaration of similar field in struct zone */ > unsigned long *pageblock_flags; > -#ifdef CONFIG_CGROUP_MEM_RES_CTLR > +#ifdef CONFIG_PAGE_TRACKING > /* > * If !SPARSEMEM, pgdat doesn't have 
page_cgroup pointer. We use > * section. (see memcontrol.h/page_cgroup.h about this.) > diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h > index 7339c7b..f24d081 100644 > --- a/include/linux/page_cgroup.h > +++ b/include/linux/page_cgroup.h > @@ -1,7 +1,7 @@ > #ifndef __LINUX_PAGE_CGROUP_H > #define __LINUX_PAGE_CGROUP_H > > -#ifdef CONFIG_CGROUP_MEM_RES_CTLR > +#ifdef CONFIG_PAGE_TRACKING > #include <linux/bit_spinlock.h> > /* > * Page Cgroup can be considered as an extended mem_map. > @@ -12,11 +12,38 @@ > */ > struct page_cgroup { > unsigned long flags; > - struct mem_cgroup *mem_cgroup; > struct page *page; > +#ifdef CONFIG_CGROUP_MEM_RES_CTLR > + struct mem_cgroup *mem_cgroup; > struct list_head lru; /* per cgroup LRU list */ > +#endif > }; > > +/* > + * use lower 16 bits for flags and reserve the rest for the page tracking id > + */ > +#define PAGE_TRACKING_ID_SHIFT (16) > +#define PAGE_TRACKING_ID_BITS \ > + (8 * sizeof(unsigned long) - PAGE_TRACKING_ID_SHIFT) > + > +/* NOTE: must be called with page_cgroup() held */ > +static inline unsigned long page_cgroup_get_id(struct page_cgroup *pc) > +{ > + return pc->flags >> PAGE_TRACKING_ID_SHIFT; > +} > + > +/* NOTE: must be called with page_cgroup() held */ > +static inline void page_cgroup_set_id(struct page_cgroup *pc, unsigned long id) > +{ > + WARN_ON(id >= (1UL << PAGE_TRACKING_ID_BITS)); > + pc->flags &= (1UL << PAGE_TRACKING_ID_SHIFT) - 1; > + pc->flags |= (unsigned long)(id << PAGE_TRACKING_ID_SHIFT); > +} > + > +unsigned long page_cgroup_get_owner(struct page *page); > +int page_cgroup_set_owner(struct page *page, unsigned long id); > +int page_cgroup_copy_owner(struct page *npage, struct page *opage); > + > void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); > void __init page_cgroup_init(void); > struct page_cgroup *lookup_page_cgroup(struct page *page); > @@ -71,7 +98,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) > bit_spin_unlock(PCG_LOCK, 
&pc->flags); > } > > -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ > +#else /* CONFIG_PAGE_TRACKING */ > struct page_cgroup; > > static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) > diff --git a/init/Kconfig b/init/Kconfig > index 7be4d38..5428ac7 100644 > --- a/init/Kconfig > +++ b/init/Kconfig > @@ -569,6 +569,7 @@ config CGROUP_MEM_RES_CTLR > bool "Memory Resource Controller for Control Groups" > depends on CGROUPS && RESOURCE_COUNTERS > select MM_OWNER > + select PAGE_TRACKING > help > Provides a memory resource controller that manages both anonymous > memory and page cache. (See Documentation/cgroups/memory.txt) > @@ -611,6 +612,9 @@ endif # CGROUPS > config MM_OWNER > bool > > +config PAGE_TRACKING > + bool > + > config SYSFS_DEPRECATED > bool > > diff --git a/mm/Makefile b/mm/Makefile > index ec73c68..b94e074 100644 > --- a/mm/Makefile > +++ b/mm/Makefile > @@ -37,4 +37,5 @@ else > obj-$(CONFIG_SMP) += allocpercpu.o > endif > obj-$(CONFIG_QUICKLIST) += quicklist.o > -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o > +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o > +obj-$(CONFIG_PAGE_TRACKING) += page_cgroup.o > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index e44fb0f..69d1c31 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -2524,6 +2524,12 @@ struct cgroup_subsys mem_cgroup_subsys = { > .use_id = 1, > }; > > +void __meminit __init_mem_page_cgroup(struct page_cgroup *pc) > +{ > + pc->mem_cgroup = NULL; > + INIT_LIST_HEAD(&pc->lru); > +} > + > #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP > > static int __init disable_swap_account(char *s) > diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c > index 791905c..b3b394c 100644 > --- a/mm/page_cgroup.c > +++ b/mm/page_cgroup.c > @@ -3,6 +3,7 @@ > #include <linux/bootmem.h> > #include <linux/bit_spinlock.h> > #include <linux/page_cgroup.h> > +#include <linux/blk-io-throttle.h> > #include <linux/hash.h> > #include <linux/slab.h> > #include <linux/memory.h> > @@ -14,9 
+15,8 @@ static void __meminit > __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) > { > pc->flags = 0; > - pc->mem_cgroup = NULL; > pc->page = pfn_to_page(pfn); > - INIT_LIST_HEAD(&pc->lru); > + __init_mem_page_cgroup(pc); > } > static unsigned long total_usage; > > @@ -74,7 +74,7 @@ void __init page_cgroup_init(void) > > int nid, fail; > > - if (mem_cgroup_disabled()) > + if (mem_cgroup_disabled() && iothrottle_disabled()) > return; > > for_each_online_node(nid) { > @@ -83,12 +83,13 @@ void __init page_cgroup_init(void) > goto fail; > } > printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); > - printk(KERN_INFO "please try cgroup_disable=memory option if you" > - " don't want\n"); > + printk(KERN_INFO > + "try cgroup_disable=memory,blockio option if you don't want\n"); > return; > fail: > printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); > - printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); > + printk(KERN_CRIT > + "try cgroup_disable=memory,blockio boot option\n"); > panic("Out of memory"); > } > > @@ -243,12 +244,85 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, > > #endif > > +/** > + * page_cgroup_get_owner() - get the owner ID of a page > + * @page: the page we want to find the owner > + * > + * Returns the owner ID of the page, 0 means that the owner cannot be > + * retrieved. > + **/ > +unsigned long page_cgroup_get_owner(struct page *page) > +{ > + struct page_cgroup *pc; > + unsigned long ret; > + > + pc = lookup_page_cgroup(page); > + if (unlikely(!pc)) > + return 0; > + > + lock_page_cgroup(pc); > + ret = page_cgroup_get_id(pc); > + unlock_page_cgroup(pc); > + return ret; > +} > + > +/** > + * page_cgroup_set_owner() - set the owner ID of a page > + * @page: the page we want to tag > + * @id: the ID number that will be associated to page > + * > + * Returns 0 if the owner is correctly associated to the page. Returns a > + * negative value in case of failure. 
> + **/ > +int page_cgroup_set_owner(struct page *page, unsigned long id) > +{ > + struct page_cgroup *pc; > + > + pc = lookup_page_cgroup(page); > + if (unlikely(!pc)) > + return -ENOENT; > + > + lock_page_cgroup(pc); > + page_cgroup_set_id(pc, id); > + unlock_page_cgroup(pc); > + return 0; > +} > + > +/** > + * page_cgroup_copy_owner() - copy the owner ID of a page into another page > + * @npage: the page where we want to copy the owner > + * @opage: the page from which we want to copy the ID > + * > + * Returns 0 if the owner is correctly associated to npage. Returns a negative > + * value in case of failure. > + **/ > +int page_cgroup_copy_owner(struct page *npage, struct page *opage) > +{ > + struct page_cgroup *npc, *opc; > + unsigned long id; > + > + npc = lookup_page_cgroup(npage); > + if (unlikely(!npc)) > + return -ENOENT; > + opc = lookup_page_cgroup(opage); > + if (unlikely(!opc)) > + return -ENOENT; > + lock_page_cgroup(opc); > + lock_page_cgroup(npc); > + id = page_cgroup_get_id(opc); > + page_cgroup_set_id(npc, id); > + unlock_page_cgroup(npc); > + unlock_page_cgroup(opc); > + > + return 0; > +} > + > void __init page_cgroup_init(void) > { > unsigned long pfn; > int fail = 0; > > - if (mem_cgroup_disabled()) > + if (mem_cgroup_disabled() && iothrottle_disabled()) > return; > > for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { > @@ -257,14 +331,15 @@ void __init page_cgroup_init(void) > fail = init_section_page_cgroup(pfn); > } > if (fail) { > - printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); > + printk(KERN_CRIT > + "try cgroup_disable=memory,blockio boot option\n"); > panic("Out of memory"); > } else { > hotplug_memory_notifier(page_cgroup_callback, 0); > } > printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); > - printk(KERN_INFO "please try cgroup_disable=memory option if you don't" > - " want\n"); > + printk(KERN_INFO > + "try cgroup_disable=memory,blockio option if you don't want\n"); > } > > void 
__meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) -- Regards Gui Jianfeng _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linux-foundation.org/mailman/listinfo/containers