On Sat, Apr 16, 2011 at 8:23 AM, Ying Han <yinghan@xxxxxxxxxx> wrote:
> Add the kswapd_mem field to the kswapd descriptor, which links the kswapd
> kernel thread to a memcg. The per-memcg kswapd sleeps on the wait
> queue headed at the kswapd_wait field of the kswapd descriptor.
>
> The kswapd() function is now shared between global and per-memcg kswapd. It
> is passed the kswapd descriptor, which identifies either a node or a memcg.
> The new function balance_mem_cgroup_pgdat is then invoked for a per-memcg
> kswapd thread; the implementation of that function is in the following patch.
>
> changelog v4..v3:
> 1. fix up kswapd_run and kswapd_stop for online_pages() and offline_pages().
> 2. drop the PF_MEMALLOC flag for memcg kswapd for now, per KAMEZAWA's request.
>
> changelog v3..v2:
> 1. split off from the initial patch, which included all changes of the
> following three patches.
>
> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
> ---
>  include/linux/memcontrol.h |    5 ++
>  include/linux/swap.h       |    5 +-
>  mm/memcontrol.c            |   29 ++++++++
>  mm/memory_hotplug.c        |    4 +-
>  mm/vmscan.c                |  156 ++++++++++++++++++++++++++++++--------------
>  5 files changed, 147 insertions(+), 52 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 3ece36d..f7ffd1f 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -24,6 +24,7 @@ struct mem_cgroup;
>  struct page_cgroup;
>  struct page;
>  struct mm_struct;
> +struct kswapd;
>
>  /* Stats that can be updated by kernel. */
>  enum mem_cgroup_page_stat_item {
> @@ -83,6 +84,10 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem);
>  extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
>  extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
>  extern int mem_cgroup_watermark_ok(struct mem_cgroup *mem, int charge_flags);
> +extern int mem_cgroup_init_kswapd(struct mem_cgroup *mem,
> +                                  struct kswapd *kswapd_p);
> +extern void mem_cgroup_clear_kswapd(struct mem_cgroup *mem);
> +extern wait_queue_head_t *mem_cgroup_kswapd_wait(struct mem_cgroup *mem);
>
>  static inline
>  int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup)
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index f43d406..17e0511 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -30,6 +30,7 @@ struct kswapd {
>          struct task_struct *kswapd_task;
>          wait_queue_head_t kswapd_wait;
>          pg_data_t *kswapd_pgdat;
> +        struct mem_cgroup *kswapd_mem;
>  };
>
>  int kswapd(void *p);
> @@ -303,8 +304,8 @@ static inline void scan_unevictable_unregister_node(struct node *node)
>  }
>  #endif
>
> -extern int kswapd_run(int nid);
> -extern void kswapd_stop(int nid);
> +extern int kswapd_run(int nid, struct mem_cgroup *mem);
> +extern void kswapd_stop(int nid, struct mem_cgroup *mem);
>
>  #ifdef CONFIG_MMU
>  /* linux/mm/shmem.c */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 76ad009..8761a6f 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -278,6 +278,8 @@ struct mem_cgroup {
>           */
>          u64 high_wmark_distance;
>          u64 low_wmark_distance;
> +
> +        wait_queue_head_t *kswapd_wait;

As I mentioned in [1/10], I would personally prefer storing the kswapd
descriptor here instead of kswapd_wait. Just a personal opinion; feel
free to ignore it.

>  };
>
>  /* Stuffs for move charges at task migration. */
> @@ -4670,6 +4672,33 @@ int mem_cgroup_watermark_ok(struct mem_cgroup *mem,
>          return ret;
>  }
>
> +int mem_cgroup_init_kswapd(struct mem_cgroup *mem, struct kswapd *kswapd_p)
> +{
> +        if (!mem || !kswapd_p)
> +                return 0;
> +
> +        mem->kswapd_wait = &kswapd_p->kswapd_wait;
> +        kswapd_p->kswapd_mem = mem;
> +
> +        return css_id(&mem->css);
> +}
> +
> +void mem_cgroup_clear_kswapd(struct mem_cgroup *mem)
> +{
> +        if (mem)
> +                mem->kswapd_wait = NULL;
> +
> +        return;
> +}
> +
> +wait_queue_head_t *mem_cgroup_kswapd_wait(struct mem_cgroup *mem)
> +{
> +        if (!mem)
> +                return NULL;
> +
> +        return mem->kswapd_wait;
> +}
> +
>  static int mem_cgroup_soft_limit_tree_init(void)
>  {
>          struct mem_cgroup_tree_per_node *rtpn;
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 321fc74..2f78ff6 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -462,7 +462,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages)
>          setup_per_zone_wmarks();
>          calculate_zone_inactive_ratio(zone);
>          if (onlined_pages) {
> -                kswapd_run(zone_to_nid(zone));
> +                kswapd_run(zone_to_nid(zone), NULL);
>                  node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
>          }
>
> @@ -897,7 +897,7 @@ repeat:
>          calculate_zone_inactive_ratio(zone);
>          if (!node_present_pages(node)) {
>                  node_clear_state(node, N_HIGH_MEMORY);
> -                kswapd_stop(node);
> +                kswapd_stop(node, NULL);
>          }
>
>          vm_total_pages = nr_free_pagecache_pages();
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 61fb96e..06036d2 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2241,6 +2241,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages,
>          return balanced_pages > (present_pages >> 2);
>  }
>
> +#define is_node_kswapd(kswapd_p) (!(kswapd_p)->kswapd_mem)

In memcg we already have a similar concept, scanning_global_lru. How
about using the "global" term here as well?

> +
>  /* is kswapd sleeping prematurely? */
>  static int sleeping_prematurely(struct kswapd *kswapd, int order,
>                                  long remaining, int classzone_idx)
> @@ -2249,11 +2251,16 @@ static int sleeping_prematurely(struct kswapd *kswapd, int order,
>          unsigned long balanced = 0;
>          bool all_zones_ok = true;
>          pg_data_t *pgdat = kswapd->kswapd_pgdat;
> +        struct mem_cgroup *mem = kswapd->kswapd_mem;
>
>          /* If a direct reclaimer woke kswapd within HZ/10, it's premature */
>          if (remaining)
>                  return true;
>
> +        /* Not supported for per-memcg reclaim */
> +        if (mem)
> +                return false;
> +

How about an is_global_kswapd() helper instead of checking whether the
mem field is NULL or not?

>          /* Check the watermark levels */
>          for (i = 0; i < pgdat->nr_zones; i++) {
>                  struct zone *zone = pgdat->node_zones + i;
> @@ -2596,19 +2603,25 @@ static void kswapd_try_to_sleep(struct kswapd *kswapd_p, int order,
>           * go fully to sleep until explicitly woken up.
>           */
>          if (!sleeping_prematurely(kswapd_p, order, remaining, classzone_idx)) {
> -                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
> +                if (is_node_kswapd(kswapd_p)) {
> +                        trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
>
> -                /*
> -                 * vmstat counters are not perfectly accurate and the estimated
> -                 * value for counters such as NR_FREE_PAGES can deviate from the
> -                 * true value by nr_online_cpus * threshold. To avoid the zone
> -                 * watermarks being breached while under pressure, we reduce the
> -                 * per-cpu vmstat threshold while kswapd is awake and restore
> -                 * them before going back to sleep.
> -                 */
> -                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
> -                schedule();
> -                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
> +                        /*
> +                         * vmstat counters are not perfectly accurate and the
> +                         * estimated value for counters such as NR_FREE_PAGES
> +                         * can deviate from the true value by nr_online_cpus *
> +                         * threshold. To avoid the zone watermarks being
> +                         * breached while under pressure, we reduce the per-cpu
> +                         * vmstat threshold while kswapd is awake and restore
> +                         * them before going back to sleep.
> +                         */
> +                        set_pgdat_percpu_threshold(pgdat,
> +                                                   calculate_normal_threshold);
> +                        schedule();
> +                        set_pgdat_percpu_threshold(pgdat,
> +                                                calculate_pressure_threshold);
> +                } else
> +                        schedule();
>          } else {
>                  if (remaining)
>                          count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
> @@ -2618,6 +2631,12 @@ static void kswapd_try_to_sleep(struct kswapd *kswapd_p, int order,
>          finish_wait(wait_h, &wait);
>  }
>
> +static unsigned long balance_mem_cgroup_pgdat(struct mem_cgroup *mem_cont,
> +                                                        int order)
> +{
> +        return 0;
> +}
> +
>  /*
>   * The background pageout daemon, started as a kernel thread
>   * from the init process.
> @@ -2637,6 +2656,7 @@ int kswapd(void *p)
>          int classzone_idx;
>          struct kswapd *kswapd_p = (struct kswapd *)p;
>          pg_data_t *pgdat = kswapd_p->kswapd_pgdat;
> +        struct mem_cgroup *mem = kswapd_p->kswapd_mem;
>          wait_queue_head_t *wait_h = &kswapd_p->kswapd_wait;
>          struct task_struct *tsk = current;
>
> @@ -2647,10 +2667,12 @@ int kswapd(void *p)
>
>          lockdep_set_current_reclaim_state(GFP_KERNEL);
>
> -        BUG_ON(pgdat->kswapd_wait != wait_h);
> -        cpumask = cpumask_of_node(pgdat->node_id);
> -        if (!cpumask_empty(cpumask))
> -                set_cpus_allowed_ptr(tsk, cpumask);
> +        if (is_node_kswapd(kswapd_p)) {
> +                BUG_ON(pgdat->kswapd_wait != wait_h);
> +                cpumask = cpumask_of_node(pgdat->node_id);
> +                if (!cpumask_empty(cpumask))
> +                        set_cpus_allowed_ptr(tsk, cpumask);
> +        }
>          current->reclaim_state = &reclaim_state;
>
>          /*
> @@ -2665,7 +2687,10 @@ int kswapd(void *p)
>           * us from recursively trying to free more memory as we're
>           * trying to free the first piece of memory in the first place).
>           */
> -        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> +        if (is_node_kswapd(kswapd_p))
> +                tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
> +        else
> +                tsk->flags |= PF_SWAPWRITE | PF_KSWAPD;
>          set_freezable();
>
>          order = 0;
> @@ -2675,24 +2700,29 @@ int kswapd(void *p)
>                  int new_classzone_idx;
>                  int ret;
>
> -                new_order = pgdat->kswapd_max_order;
> -                new_classzone_idx = pgdat->classzone_idx;
> -                pgdat->kswapd_max_order = 0;
> -                pgdat->classzone_idx = MAX_NR_ZONES - 1;
> -                if (order < new_order || classzone_idx > new_classzone_idx) {
> -                        /*
> -                         * Don't sleep if someone wants a larger 'order'
> -                         * allocation or has tigher zone constraints
> -                         */
> -                        order = new_order;
> -                        classzone_idx = new_classzone_idx;
> -                } else {
> -                        kswapd_try_to_sleep(kswapd_p, order, classzone_idx);
> -                        order = pgdat->kswapd_max_order;
> -                        classzone_idx = pgdat->classzone_idx;
> +                if (is_node_kswapd(kswapd_p)) {
> +                        new_order = pgdat->kswapd_max_order;
> +                        new_classzone_idx = pgdat->classzone_idx;
>                          pgdat->kswapd_max_order = 0;
>                          pgdat->classzone_idx = MAX_NR_ZONES - 1;
> -                }
> +                        if (order < new_order ||
> +                                        classzone_idx > new_classzone_idx) {
> +                                /*
> +                                 * Don't sleep if someone wants a larger 'order'
> +                                 * allocation or has tigher zone constraints
> +                                 */
> +                                order = new_order;
> +                                classzone_idx = new_classzone_idx;
> +                        } else {
> +                                kswapd_try_to_sleep(kswapd_p, order,
> +                                                    classzone_idx);
> +                                order = pgdat->kswapd_max_order;
> +                                classzone_idx = pgdat->classzone_idx;
> +                                pgdat->kswapd_max_order = 0;
> +                                pgdat->classzone_idx = MAX_NR_ZONES - 1;
> +                        }
> +                } else
> +                        kswapd_try_to_sleep(kswapd_p, order, classzone_idx);
>
>                  ret = try_to_freeze();
>                  if (kthread_should_stop())
> @@ -2703,8 +2733,13 @@ int kswapd(void *p)
>                   * after returning from the refrigerator
>                   */
>                  if (!ret) {
> -                        trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
> -                        order = balance_pgdat(pgdat, order, &classzone_idx);
> +                        if (is_node_kswapd(kswapd_p)) {
> +                                trace_mm_vmscan_kswapd_wake(pgdat->node_id,
> +                                                                order);
> +                                order = balance_pgdat(pgdat, order,
> +                                                        &classzone_idx);
> +                        } else
> +                                balance_mem_cgroup_pgdat(mem, order);
>                  }
>          }
>          return 0;
> @@ -2849,30 +2884,53 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
>   * This kswapd start function will be called by init and node-hot-add.
>   * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
>   */
> -int kswapd_run(int nid)
> +int kswapd_run(int nid, struct mem_cgroup *mem)
>  {
> -        pg_data_t *pgdat = NODE_DATA(nid);
>          struct task_struct *kswapd_thr;
> +        pg_data_t *pgdat = NULL;
>          struct kswapd *kswapd_p;
> +        static char name[TASK_COMM_LEN];
> +        int memcg_id;
>          int ret = 0;
>
> -        if (pgdat->kswapd_wait)
> -                return 0;
> +        if (!mem) {
> +                pgdat = NODE_DATA(nid);
> +                if (pgdat->kswapd_wait)
> +                        return ret;
> +        }
>
>          kswapd_p = kzalloc(sizeof(struct kswapd), GFP_KERNEL);
>          if (!kswapd_p)
>                  return -ENOMEM;
>
>          init_waitqueue_head(&kswapd_p->kswapd_wait);
> -        pgdat->kswapd_wait = &kswapd_p->kswapd_wait;
> -        kswapd_p->kswapd_pgdat = pgdat;
>
> -        kswapd_thr = kthread_run(kswapd, kswapd_p, "kswapd%d", nid);
> +        if (!mem) {
> +                pgdat->kswapd_wait = &kswapd_p->kswapd_wait;
> +                kswapd_p->kswapd_pgdat = pgdat;
> +                snprintf(name, TASK_COMM_LEN, "kswapd_%d", nid);
> +        } else {
> +                memcg_id = mem_cgroup_init_kswapd(mem, kswapd_p);
> +                if (!memcg_id) {
> +                        kfree(kswapd_p);
> +                        return ret;
> +                }
> +                snprintf(name, TASK_COMM_LEN, "memcg_%d", memcg_id);
> +        }
> +
> +        kswapd_thr = kthread_run(kswapd, kswapd_p, name);
>          if (IS_ERR(kswapd_thr)) {
>                  /* failure at boot is fatal */
>                  BUG_ON(system_state == SYSTEM_BOOTING);
> -                printk("Failed to start kswapd on node %d\n",nid);
> -                pgdat->kswapd_wait = NULL;
> +                if (!mem) {
> +                        printk(KERN_ERR "Failed to start kswapd on node %d\n",
> +                                                                nid);
> +                        pgdat->kswapd_wait = NULL;
> +                } else {
> +                        printk(KERN_ERR "Failed to start kswapd on memcg %d\n",
> +                                                                memcg_id);
> +                        mem_cgroup_clear_kswapd(mem);
> +                }
>                  kfree(kswapd_p);
>                  ret = -1;
>          } else
> @@ -2883,15 +2941,17 @@ int kswapd_run(int nid)
>  /*
>   * Called by memory hotplug when all memory in a node is offlined.
>   */
> -void kswapd_stop(int nid)
> +void kswapd_stop(int nid, struct mem_cgroup *mem)
>  {
>          struct task_struct *kswapd_thr = NULL;
>          struct kswapd *kswapd_p = NULL;
>          wait_queue_head_t *wait;
>
> -        pg_data_t *pgdat = NODE_DATA(nid);
> +        if (!mem)
> +                wait = NODE_DATA(nid)->kswapd_wait;
> +        else
> +                wait = mem_cgroup_kswapd_wait(mem);
>
> -        wait = pgdat->kswapd_wait;
>          if (wait) {
>                  kswapd_p = container_of(wait, struct kswapd, kswapd_wait);
>                  kswapd_thr = kswapd_p->kswapd_task;
> @@ -2910,7 +2970,7 @@ static int __init kswapd_init(void)
>
>          swap_setup();
>          for_each_node_state(nid, N_HIGH_MEMORY)
> -                kswapd_run(nid);
> +                kswapd_run(nid, NULL);
>          hotcpu_notifier(cpu_callback, 0);
>          return 0;
>  }
> --
> 1.7.3.1
>
>

Let me ask a question: what is the effect of kswapd_try_to_sleep() for a
memcg kswapd? As far as I can see, sleeping_prematurely() always returns
false for memcg, so kswapd_try_to_sleep() just sleeps briefly and then
sleeps again until the next wakeup, which has nothing to do with
kswapd_try_to_sleep()'s actual purpose. So I would prefer that you
remove the memcg hack from kswapd_try_to_sleep() and simply call
schedule() in the kswapd() function to put the memcg kswapd to sleep;
see the sketch in the P.S. below. That said, I haven't looked at the
later patches in the series, so I may be missing something.

--
Kind regards,
Minchan Kim
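
P.S. Purely for illustration, here is an untested sketch of what I mean
above. The names is_global_kswapd() and memcg_kswapd_wait_for_work() are
hypothetical names of mine, not existing kernel symbols, and I assume
this would sit in mm/vmscan.c next to kswapd_try_to_sleep():

/* Hypothetical helper instead of open-coding the kswapd_mem NULL check. */
static inline bool is_global_kswapd(struct kswapd *kswapd_p)
{
        /* The global (per-node) kswapd is the one with no memcg attached. */
        return kswapd_p->kswapd_mem == NULL;
}

/*
 * Hypothetical sketch: a per-memcg kswapd has no zone watermarks to
 * re-check, so instead of going through kswapd_try_to_sleep() it could
 * simply sleep on its own wait queue until the next wakeup.
 */
static void memcg_kswapd_wait_for_work(struct kswapd *kswapd_p)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(&kswapd_p->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
        if (!kthread_should_stop())
                schedule();
        finish_wait(&kswapd_p->kswapd_wait, &wait);
}

With something like this, the memcg branch of the main loop in kswapd()
would reduce to a single memcg_kswapd_wait_for_work(kswapd_p) call, and
kswapd_try_to_sleep() could stay global-only.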