On Sat, Apr 16, 2011 at 8:23 AM, Ying Han <yinghan@xxxxxxxxxx> wrote:
> This is the main loop of per-memcg background reclaim which is implemented in
> function balance_mem_cgroup_pgdat().
>
> The function performs a priority loop similar to global reclaim. During each
> iteration it invokes balance_pgdat_node() for all nodes on the system, which
> is another new function that performs background reclaim per node. After
> reclaiming each node, it checks mem_cgroup_watermark_ok() and breaks the
> priority loop if it returns true.
>
> changelog v5..v4:
> 1. remove duplicate check on nodes_empty()
> 2. add logic to check if the per-memcg lru is empty on the zone.
> 3. make per-memcg kswapd reclaim SWAP_CLUSTER_MAX per zone. It makes sense
> since it helps to balance the pressure across zones within the memcg.
>
> changelog v4..v3:
> 1. split select_victim_node and zone_unreclaimable into separate patches
> 2. remove the logic that tries to do zone balancing.
>
> changelog v3..v2:
> 1. change mz->all_unreclaimable to be boolean.
> 2. define ZONE_RECLAIMABLE_RATE macro shared by zone and per-memcg reclaim.
> 3. some more clean-up.
>
> changelog v2..v1:
> 1. move the per-memcg per-zone clear_unreclaimable into the uncharge stage.
> 2. share kswapd_run/kswapd_stop for per-memcg and global background
> reclaim.
> 3. name the per-memcg kswapd as "memcg-id" (css->id). The global kswapd
> keeps the same name.
> 4. fix a race on kswapd_stop where the per-memcg-per-zone info could be
> accessed after freeing.
> 5. add fairness in the zonelist where the memcg remembers the last zone
> reclaimed from.
>
> Signed-off-by: Ying Han <yinghan@xxxxxxxxxx>
> ---
>  mm/vmscan.c |  157 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 files changed, 157 insertions(+), 0 deletions(-)
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 06036d2..39e6300 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -47,6 +47,8 @@
>
>  #include <linux/swapops.h>
>
> +#include <linux/res_counter.h>
> +
>  #include "internal.h"
>
>  #define CREATE_TRACE_POINTS
> @@ -111,6 +113,8 @@ struct scan_control {
>        * are scanned.
>        */
>       nodemask_t      *nodemask;
> +
> +     int priority;
>  };
>
>  #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
> @@ -2631,11 +2635,164 @@ static void kswapd_try_to_sleep(struct kswapd *kswapd_p, int order,
>       finish_wait(wait_h, &wait);
>  }
>
> +#ifdef CONFIG_CGROUP_MEM_RES_CTLR
> +/*
> + * The function is used for per-memcg LRU. It scans all the zones of the
> + * node and returns the nr_scanned and nr_reclaimed.
> + */
> +static void balance_pgdat_node(pg_data_t *pgdat, int order,
> +                              struct scan_control *sc)
> +{
> +     int i;
> +     unsigned long total_scanned = 0;
> +     struct mem_cgroup *mem_cont = sc->mem_cgroup;
> +     int priority = sc->priority;
> +     enum lru_list l;
> +
> +     /*
> +      * This dma->highmem order is consistent with global reclaim.
> +      * We do this because the page allocator works in the opposite
> +      * direction although memcg user pages are mostly allocated at
> +      * highmem.
> +      */
> +     for (i = 0; i < pgdat->nr_zones; i++) {
> +             struct zone *zone = pgdat->node_zones + i;
> +             unsigned long scan = 0;
> +
> +             for_each_evictable_lru(l)
> +                     scan += mem_cgroup_zone_nr_pages(mem_cont, zone, l);
> +
> +             if (!populated_zone(zone) || !scan)
> +                     continue;

Do we really need this double check? Isn't the _scan_ check alone
enough? And shouldn't we consider the non-swap case?
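To illustrate, a sketch of what I have in mind. It assumes
mem_cgroup_zone_nr_pages() returns 0 for an unpopulated zone, which
should hold since such a zone has no pages on any LRU:

		for_each_evictable_lru(l)
			scan += mem_cgroup_zone_nr_pages(mem_cont, zone, l);

		/*
		 * A zone with nothing on the per-memcg LRUs is skipped,
		 * populated or not, so populated_zone() is redundant.
		 */
		if (!scan)
			continue;

And for the non-swap case, maybe something along these lines:

		/* hypothetical: without swap, anon pages cannot be reclaimed */
		if (!nr_swap_pages)
			sc->may_swap = 0;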
>
> +             sc->nr_scanned = 0;
> +             shrink_zone(priority, zone, sc);
> +             total_scanned += sc->nr_scanned;
> +
> +             /*
> +              * If we've done a decent amount of scanning and
> +              * the reclaim ratio is low, start doing writepage
> +              * even in laptop mode
> +              */
> +             if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
> +                 total_scanned > sc->nr_reclaimed + sc->nr_reclaimed / 2) {
> +                     sc->may_writepage = 1;

I don't want to add more random writes while we are not under real
memory shortage. Do you have a reason to reclaim memory urgently enough
to write out dirty pages here? If we wait a little, the flusher threads
should write the pages out for us.

> +             }
> +     }
> +
> +     sc->nr_scanned = total_scanned;
> +     return;

Unnecessary return.

> +}
> +
> +/*
> + * Per cgroup background reclaim.
> + * TODO: Take off the order since memcg always does order 0
> + */
> +static unsigned long balance_mem_cgroup_pgdat(struct mem_cgroup *mem_cont,
> +                                             int order)
> +{
> +     int i, nid;
> +     int start_node;
> +     int priority;
> +     bool wmark_ok;
> +     int loop;
> +     pg_data_t *pgdat;
> +     nodemask_t do_nodes;
> +     unsigned long total_scanned;
> +     struct scan_control sc = {
> +             .gfp_mask = GFP_KERNEL,
> +             .may_unmap = 1,
> +             .may_swap = 1,
> +             .nr_to_reclaim = SWAP_CLUSTER_MAX,
> +             .swappiness = vm_swappiness,
> +             .order = order,
> +             .mem_cgroup = mem_cont,
> +     };
> +
> +loop_again:
> +     do_nodes = NODE_MASK_NONE;
> +     sc.may_writepage = !laptop_mode;

In the memcg case, I think this should depend on urgency (i.e. priority,
the reclaim ratio (#reclaimed/#scanned), or something like that), not on
laptop_mode. As I said earlier, it would be better to avoid random
writes.

> +     sc.nr_reclaimed = 0;
> +     total_scanned = 0;
> +
> +     for (priority = DEF_PRIORITY; priority >= 0; priority--) {
> +             sc.priority = priority;
> +             wmark_ok = false;
> +             loop = 0;
> +
> +             /* The swap token gets in the way of swapout... */
> +             if (!priority)
> +                     disable_swap_token();
> +
> +             if (priority == DEF_PRIORITY)
> +                     do_nodes = node_states[N_ONLINE];
> +
> +             while (1) {
> +                     nid = mem_cgroup_select_victim_node(mem_cont,
> +                                                     &do_nodes);
> +
> +                     /* Indicate we have cycled the nodelist once

Fix comment style.

> +                      * TODO: we might add MAX_RECLAIM_LOOP for preventing
> +                      * kswapd burning cpu cycles.
> +                      */
> +                     if (loop == 0) {
> +                             start_node = nid;
> +                             loop++;
> +                     } else if (nid == start_node)
> +                             break;
> +
> +                     pgdat = NODE_DATA(nid);
> +                     balance_pgdat_node(pgdat, order, &sc);
> +                     total_scanned += sc.nr_scanned;
> +
> +                     /* Set the node which has at least

Fix comment style here as well.

> +                      * one reclaimable zone
> +                      */
> +                     for (i = pgdat->nr_zones - 1; i >= 0; i--) {
> +                             struct zone *zone = pgdat->node_zones + i;
> +
> +                             if (!populated_zone(zone))
> +                                     continue;
> +                     }

I can't understand this comment and logic. The comment mentions a
reclaimable zone, but the code only checks populated_zone(). What is
the intent here?
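If the intent is "clear the node from do_nodes only when none of its
zones can be reclaimed from any more", I would expect something along
these lines (mem_cgroup_mz_unreclaimable() is just a placeholder name
for whatever per-memcg unreclaimable test this series provides):

			for (i = pgdat->nr_zones - 1; i >= 0; i--) {
				struct zone *zone = pgdat->node_zones + i;

				if (!populated_zone(zone))
					continue;

				/* placeholder for the real per-memcg test */
				if (!mem_cgroup_mz_unreclaimable(mem_cont,
								 zone))
					break;
			}

That way i >= 0 means we found a reclaimable zone and the node stays in
do_nodes, which matches the node_clear() below.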
> +                     if (i < 0)
> +                             node_clear(nid, do_nodes);
> +
> +                     if (mem_cgroup_watermark_ok(mem_cont,
> +                                                     CHARGE_WMARK_HIGH)) {
> +                             wmark_ok = true;
> +                             goto out;
> +                     }
> +
> +                     if (nodes_empty(do_nodes)) {
> +                             wmark_ok = true;
> +                             goto out;
> +                     }
> +             }
> +
> +             if (total_scanned && priority < DEF_PRIORITY - 2)
> +                     congestion_wait(WRITE, HZ/10);
> +
> +             if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
> +                     break;
> +     }
> +out:
> +     if (!wmark_ok) {
> +             cond_resched();
> +
> +             try_to_freeze();
> +
> +             goto loop_again;
> +     }
> +
> +     return sc.nr_reclaimed;
> +}
> +#else
>  static unsigned long balance_mem_cgroup_pgdat(struct mem_cgroup *mem_cont,
>                                               int order)
>  {
>       return 0;
>  }
> +#endif
>
>  /*
>   * The background pageout daemon, started as a kernel thread
> --
> 1.7.3.1
>
>

--
Kind regards,
Minchan Kim