Hi Ying, 2011/4/13 Ying Han <yinghan@xxxxxxxxxx>: > -extern int kswapd_run(int nid); > -extern void kswapd_stop(int nid); > +extern int kswapd_run(int nid, struct mem_cgroup *mem); > +extern void kswapd_stop(int nid, struct mem_cgroup *mem); This breaks online_pages() and offline_pages(), which are also callers of kswapd_run() and kswapd_stop(). Thanks, Zhu Yanhai > >  #ifdef CONFIG_MMU >  /* linux/mm/shmem.c */ > diff --git a/mm/memcontrol.c b/mm/memcontrol.c > index 36ae377..acd84a8 100644 > --- a/mm/memcontrol.c > +++ b/mm/memcontrol.c > @@ -274,6 +274,8 @@ struct mem_cgroup { >     spinlock_t pcp_counter_lock; > >     int wmark_ratio; > + > +    wait_queue_head_t *kswapd_wait; >  }; > >  /* Stuffs for move charges at task migration. */ > @@ -4622,6 +4624,33 @@ int mem_cgroup_watermark_ok(struct mem_cgroup *mem, >     return ret; >  } > > +int mem_cgroup_init_kswapd(struct mem_cgroup *mem, struct kswapd *kswapd_p) > +{ > +    if (!mem || !kswapd_p) > +        return 0; > + > +    mem->kswapd_wait = &kswapd_p->kswapd_wait; > +    kswapd_p->kswapd_mem = mem; > + > +    return css_id(&mem->css); > +} > + > +void mem_cgroup_clear_kswapd(struct mem_cgroup *mem) > +{ > +    if (mem) > +        mem->kswapd_wait = NULL; > + > +    return; > +} > + > +wait_queue_head_t *mem_cgroup_kswapd_wait(struct mem_cgroup *mem) > +{ > +    if (!mem) > +        return NULL; > + > +    return mem->kswapd_wait; > +} > + >  static int mem_cgroup_soft_limit_tree_init(void) >  { >     struct mem_cgroup_tree_per_node *rtpn; > diff --git a/mm/vmscan.c b/mm/vmscan.c > index 77ac74f..a1a1211 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -2242,6 +2242,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, >  } > >  static DEFINE_SPINLOCK(kswapds_spinlock); > +#define is_node_kswapd(kswapd_p) (!(kswapd_p)->kswapd_mem) > >  /* is kswapd sleeping prematurely? 
*/ > Âstatic int sleeping_prematurely(struct kswapd *kswapd, int order, > @@ -2251,11 +2252,16 @@ static int sleeping_prematurely(struct kswapd *kswapd, int order, >    Âunsigned long balanced = 0; >    Âbool all_zones_ok = true; >    Âpg_data_t *pgdat = kswapd->kswapd_pgdat; > +    struct mem_cgroup *mem = kswapd->kswapd_mem; > >    Â/* If a direct reclaimer woke kswapd within HZ/10, it's premature */ >    Âif (remaining) >        Âreturn true; > > +    /* Doesn't support for per-memcg reclaim */ > +    if (mem) > +        return false; > + >    Â/* Check the watermark levels */ >    Âfor (i = 0; i < pgdat->nr_zones; i++) { >        Âstruct zone *zone = pgdat->node_zones + i; > @@ -2598,19 +2604,25 @@ static void kswapd_try_to_sleep(struct kswapd *kswapd_p, int order, >     * go fully to sleep until explicitly woken up. >     */ >    Âif (!sleeping_prematurely(kswapd_p, order, remaining, classzone_idx)) { > -        trace_mm_vmscan_kswapd_sleep(pgdat->node_id); > +        if (is_node_kswapd(kswapd_p)) { > +            trace_mm_vmscan_kswapd_sleep(pgdat->node_id); > > -        /* > -        Â* vmstat counters are not perfectly accurate and the estimated > -        Â* value for counters such as NR_FREE_PAGES can deviate from the > -        Â* true value by nr_online_cpus * threshold. To avoid the zone > -        Â* watermarks being breached while under pressure, we reduce the > -        Â* per-cpu vmstat threshold while kswapd is awake and restore > -        Â* them before going back to sleep. > -        Â*/ > -        set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); > -        schedule(); > -        set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); > +            /* > +            Â* vmstat counters are not perfectly accurate and the > +            Â* estimated value for counters such as NR_FREE_PAGES > +            Â* can deviate from the true value by nr_online_cpus * > +            Â* threshold. 
To avoid the zone watermarks being > +            Â* breached while under pressure, we reduce the per-cpu > +            Â* vmstat threshold while kswapd is awake and restore > +            Â* them before going back to sleep. > +            Â*/ > +            set_pgdat_percpu_threshold(pgdat, > +                         Âcalculate_normal_threshold); > +            schedule(); > +            set_pgdat_percpu_threshold(pgdat, > +                        calculate_pressure_threshold); > +        } else > +            schedule(); >    Â} else { >        Âif (remaining) >            Âcount_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); > @@ -2620,6 +2632,12 @@ static void kswapd_try_to_sleep(struct kswapd *kswapd_p, int order, >    Âfinish_wait(wait_h, &wait); > Â} > > +static unsigned long balance_mem_cgroup_pgdat(struct mem_cgroup *mem_cont, > +                            int order) > +{ > +    return 0; > +} > + > Â/* > Â* The background pageout daemon, started as a kernel thread > Â* from the init process. 
> @@ -2639,6 +2657,7 @@ int kswapd(void *p) >    Âint classzone_idx; >    Âstruct kswapd *kswapd_p = (struct kswapd *)p; >    Âpg_data_t *pgdat = kswapd_p->kswapd_pgdat; > +    struct mem_cgroup *mem = kswapd_p->kswapd_mem; >    Âwait_queue_head_t *wait_h = &kswapd_p->kswapd_wait; >    Âstruct task_struct *tsk = current; > > @@ -2649,10 +2668,12 @@ int kswapd(void *p) > >    Âlockdep_set_current_reclaim_state(GFP_KERNEL); > > -    BUG_ON(pgdat->kswapd_wait != wait_h); > -    cpumask = cpumask_of_node(pgdat->node_id); > -    if (!cpumask_empty(cpumask)) > -        set_cpus_allowed_ptr(tsk, cpumask); > +    if (is_node_kswapd(kswapd_p)) { > +        BUG_ON(pgdat->kswapd_wait != wait_h); > +        cpumask = cpumask_of_node(pgdat->node_id); > +        if (!cpumask_empty(cpumask)) > +            set_cpus_allowed_ptr(tsk, cpumask); > +    } >    Âcurrent->reclaim_state = &reclaim_state; > >    Â/* > @@ -2677,24 +2698,29 @@ int kswapd(void *p) >        Âint new_classzone_idx; >        Âint ret; > > -        new_order = pgdat->kswapd_max_order; > -        new_classzone_idx = pgdat->classzone_idx; > -        pgdat->kswapd_max_order = 0; > -        pgdat->classzone_idx = MAX_NR_ZONES - 1; > -        if (order < new_order || classzone_idx > new_classzone_idx) { > -            /* > -            Â* Don't sleep if someone wants a larger 'order' > -            Â* allocation or has tigher zone constraints > -            Â*/ > -            order = new_order; > -            classzone_idx = new_classzone_idx; > -        } else { > -            kswapd_try_to_sleep(kswapd_p, order, classzone_idx); > -            order = pgdat->kswapd_max_order; > -            classzone_idx = pgdat->classzone_idx; > +        if (is_node_kswapd(kswapd_p)) { > +            new_order = pgdat->kswapd_max_order; > +            new_classzone_idx = pgdat->classzone_idx; >            Âpgdat->kswapd_max_order = 0; >            Âpgdat->classzone_idx = MAX_NR_ZONES - 1; > -        } > +            if (order < 
new_order || > +                    classzone_idx > new_classzone_idx) { > +                /* > +                Â* Don't sleep if someone wants a larger 'order' > +                Â* allocation or has tigher zone constraints > +                Â*/ > +                order = new_order; > +                classzone_idx = new_classzone_idx; > +            } else { > +                kswapd_try_to_sleep(kswapd_p, order, > +                          classzone_idx); > +                order = pgdat->kswapd_max_order; > +                classzone_idx = pgdat->classzone_idx; > +                pgdat->kswapd_max_order = 0; > +                pgdat->classzone_idx = MAX_NR_ZONES - 1; > +            } > +        } else > +            kswapd_try_to_sleep(kswapd_p, order, classzone_idx); > >        Âret = try_to_freeze(); >        Âif (kthread_should_stop()) > @@ -2705,8 +2731,13 @@ int kswapd(void *p) >         * after returning from the refrigerator >         */ >        Âif (!ret) { > -            trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); > -            order = balance_pgdat(pgdat, order, &classzone_idx); > +            if (is_node_kswapd(kswapd_p)) { > +                trace_mm_vmscan_kswapd_wake(pgdat->node_id, > +                                order); > +                order = balance_pgdat(pgdat, order, > +                            &classzone_idx); > +            } else > +                balance_mem_cgroup_pgdat(mem, order); >        Â} >    Â} >    Âreturn 0; > @@ -2853,30 +2884,53 @@ static int __devinit cpu_callback(struct notifier_block *nfb, > Â* This kswapd start function will be called by init and node-hot-add. > Â* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added. 
> Â*/ > -int kswapd_run(int nid) > +int kswapd_run(int nid, struct mem_cgroup *mem) > Â{ > -    pg_data_t *pgdat = NODE_DATA(nid); >    Âstruct task_struct *kswapd_thr; > +    pg_data_t *pgdat = NULL; >    Âstruct kswapd *kswapd_p; > +    static char name[TASK_COMM_LEN]; > +    int memcg_id; >    Âint ret = 0; > > -    if (pgdat->kswapd_wait) > -        return 0; > +    if (!mem) { > +        pgdat = NODE_DATA(nid); > +        if (pgdat->kswapd_wait) > +            return ret; > +    } > >    Âkswapd_p = kzalloc(sizeof(struct kswapd), GFP_KERNEL); >    Âif (!kswapd_p) >        Âreturn -ENOMEM; > >    Âinit_waitqueue_head(&kswapd_p->kswapd_wait); > -    pgdat->kswapd_wait = &kswapd_p->kswapd_wait; > -    kswapd_p->kswapd_pgdat = pgdat; > > -    kswapd_thr = kthread_run(kswapd, kswapd_p, "kswapd%d", nid); > +    if (!mem) { > +        pgdat->kswapd_wait = &kswapd_p->kswapd_wait; > +        kswapd_p->kswapd_pgdat = pgdat; > +        snprintf(name, TASK_COMM_LEN, "kswapd_%d", nid); > +    } else { > +        memcg_id = mem_cgroup_init_kswapd(mem, kswapd_p); > +        if (!memcg_id) { > +            kfree(kswapd_p); > +            return ret; > +        } > +        snprintf(name, TASK_COMM_LEN, "memcg_%d", memcg_id); > +    } > + > +    kswapd_thr = kthread_run(kswapd, kswapd_p, name); >    Âif (IS_ERR(kswapd_thr)) { >        Â/* failure at boot is fatal */ >        ÂBUG_ON(system_state == SYSTEM_BOOTING); > -        printk("Failed to start kswapd on node %d\n",nid); > -        pgdat->kswapd_wait = NULL; > +        if (!mem) { > +            printk(KERN_ERR "Failed to start kswapd on node %d\n", > +                                nid); > +            pgdat->kswapd_wait = NULL; > +        } else { > +            printk(KERN_ERR "Failed to start kswapd on memcg %d\n", > +                                memcg_id); > +            mem_cgroup_clear_kswapd(mem); > +        } >        Âkfree(kswapd_p); >        Âret = -1; >    Â} else > @@ -2887,16 +2941,18 @@ int 
kswapd_run(int nid) >  /* >  * Called by memory hotplug when all memory in a node is offlined. >  */ > -void kswapd_stop(int nid) > +void kswapd_stop(int nid, struct mem_cgroup *mem) >  { >     struct task_struct *kswapd_thr = NULL; >     struct kswapd *kswapd_p = NULL; >     wait_queue_head_t *wait; > > -    pg_data_t *pgdat = NODE_DATA(nid); > - >     spin_lock(&kswapds_spinlock); > -    wait = pgdat->kswapd_wait; > +    if (!mem) > +        wait = NODE_DATA(nid)->kswapd_wait; > +    else > +        wait = mem_cgroup_kswapd_wait(mem); > + >     if (wait) { >         kswapd_p = container_of(wait, struct kswapd, kswapd_wait); >         kswapd_thr = kswapd_p->kswapd_task; > @@ -2916,7 +2972,7 @@ static int __init kswapd_init(void) > >     swap_setup(); >     for_each_node_state(nid, N_HIGH_MEMORY) > -        kswapd_run(nid); > +        kswapd_run(nid, NULL); >     hotcpu_notifier(cpu_callback, 0); >     return 0; >  } > -- > 1.7.3.1 > > -- > To unsubscribe, send a message with 'unsubscribe linux-mm' in > the body to majordomo@xxxxxxxxxx  For more info on Linux MM, > see: http://www.linux-mm.org/ . > Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ > Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>