On Mon 13-05-13 11:10:47, Minchan Kim wrote: [...] > My rough plan is following as, > > 1. Make mctz->lock beging aware of irq by changing spin_lock with > spin_lock_irqsave. I wouldn't be worried about this one as it is on its way out with the soft limit rework (the core uncontroversial part ;)) > 2. Introuduce new argument "locked" in __mem_cgroup_uncharge_common > so that __mem_cgroup_uncharge_common can avoid lock_page_cgroup in > irqcontext to avoid deadlock but caller in irqcontext should be held > it in advance by next patch. > 3. Introduce try_lock_page_cgroup, which will be used __swapcache_free. > 4. __remove_mapping can held a page_cgroup lock in advance before calling > __swapcache_free > > I'd like to listen memcg people's opinions before diving into coding. It should work. It will require some code moving, though. > Signed-off-by: Minchan Kim <minchan@xxxxxxxxxx> > --- > fs/splice.c | 2 +- > include/linux/swap.h | 12 ++++++++++- > mm/swapfile.c | 2 +- > mm/truncate.c | 2 +- > mm/vmscan.c | 56 +++++++++++++++++++++++++++++++++++++++++++--------- > 5 files changed, 61 insertions(+), 13 deletions(-) > > diff --git a/fs/splice.c b/fs/splice.c > index e6b2559..db77694 100644 > --- a/fs/splice.c > +++ b/fs/splice.c > @@ -70,7 +70,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe, > * If we succeeded in removing the mapping, set LRU flag > * and return good. > */ > - if (remove_mapping(mapping, page)) { > + if (remove_mapping(mapping, page, false)) { > buf->flags |= PIPE_BUF_FLAG_LRU; > return 0; > } > diff --git a/include/linux/swap.h b/include/linux/swap.h > index ca031f7..eb126d2 100644 > --- a/include/linux/swap.h > +++ b/include/linux/swap.h > @@ -274,7 +274,8 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, > unsigned long *nr_scanned); > extern unsigned long shrink_all_memory(unsigned long nr_pages); > extern int vm_swappiness; > -extern int remove_mapping(struct address_space *mapping, struct page *page); > +extern int remove_mapping(struct address_space *mapping, struct page *page, > + bool irqcontext); > extern unsigned long vm_total_pages; > > #ifdef CONFIG_NUMA > @@ -407,6 +408,9 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) > } > #endif > > + > +extern struct swap_info_struct *swap_info_get(swp_entry_t entry); > + > #else /* CONFIG_SWAP */ > > #define get_nr_swap_pages() 0L > @@ -430,6 +434,12 @@ static inline void show_swap_cache_info(void) > #define free_swap_and_cache(swp) is_migration_entry(swp) > #define swapcache_prepare(swp) is_migration_entry(swp) > > + > +struct swap_info_struct *swap_info_get(swp_entry_t entry) > +{ > + return NULL; > +} > + > static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask) > { > return 0; > diff --git a/mm/swapfile.c b/mm/swapfile.c > index 33ebdd5..8a425d4 100644 > --- a/mm/swapfile.c > +++ b/mm/swapfile.c > @@ -505,7 +505,7 @@ swp_entry_t get_swap_page_of_type(int type) > return (swp_entry_t) {0}; > } > > -static struct swap_info_struct *swap_info_get(swp_entry_t entry) > +struct swap_info_struct *swap_info_get(swp_entry_t entry) > { > struct swap_info_struct *p; > unsigned long offset, type; > diff --git a/mm/truncate.c b/mm/truncate.c > index c75b736..fa1dc60 100644 > --- a/mm/truncate.c > +++ b/mm/truncate.c > @@ -131,7 +131,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) > if (page_has_private(page) && !try_to_release_page(page, 0)) > return 0; > > - ret = remove_mapping(mapping, page); > + ret = remove_mapping(mapping, page, false); > > return ret; > } > diff --git a/mm/vmscan.c b/mm/vmscan.c > index fa6a853..d14c9be 100644 > --- a/mm/vmscan.c > +++ b/mm/vmscan.c > @@ -450,12 +450,18 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, > * Same as remove_mapping, but if the page is removed from the mapping, it > * gets returned with a refcount of 0. > */ > -static int __remove_mapping(struct address_space *mapping, struct page *page) > +static int __remove_mapping(struct address_space *mapping, struct page *page, > + bool irqcontext) > { > + unsigned long flags; > + > BUG_ON(!PageLocked(page)); > BUG_ON(mapping != page_mapping(page)); > > - spin_lock_irq(&mapping->tree_lock); > + if (irqcontext) > + spin_lock_irqsave(&mapping->tree_lock, flags); > + else > + spin_lock_irq(&mapping->tree_lock); > /* > * The non racy check for a busy page. > * > @@ -490,17 +496,45 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) > } > > if (PageSwapCache(page)) { > + struct swap_info_struct *p; > swp_entry_t swap = { .val = page_private(page) }; > + p = swap_info_get(swap); > + /* > + * If we are irq context, check that we can get a > + * swap_info_strcut->lock before removing the page from > + * swap cache. Because __swapcache_free must be successful. > + * If __swapcache_free can be failed, we should rollback > + * things done by __delete_from_swap_cache and it needs > + * memory allocation for radix tree node in irqcontext > + * That's thing we really want to avoid. > + * TODO : memcg mem_cgroup_uncharge_swapcache handling > + * in irqcontext > + */ > + if (irqcontext && p && !spin_trylock(&p->lock)) { > + page_unfreeze_refs(page, 2); > + goto cannot_free; > + } > + > __delete_from_swap_cache(page); > - spin_unlock_irq(&mapping->tree_lock); > - swapcache_free(swap, page); > + if (irqcontext) { > + spin_unlock_irqrestore(&mapping->tree_lock, flags); > + if (p) > + __swapcache_free(p, swap, page); > + spin_unlock(&p->lock); > + } else { > + spin_unlock_irq(&mapping->tree_lock); > + swapcache_free(swap, page); > + } > } else { > void (*freepage)(struct page *); > > freepage = mapping->a_ops->freepage; > > __delete_from_page_cache(page); > - spin_unlock_irq(&mapping->tree_lock); > + if (irqcontext) > + spin_unlock_irqrestore(&mapping->tree_lock, flags); > + else > + spin_unlock_irq(&mapping->tree_lock); > mem_cgroup_uncharge_cache_page(page); > > if (freepage != NULL) > @@ -510,7 +544,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) > return 1; > > cannot_free: > - spin_unlock_irq(&mapping->tree_lock); > + if (irqcontext) > + spin_unlock_irqrestore(&mapping->tree_lock, flags); > + else > + spin_unlock_irq(&mapping->tree_lock); > return 0; > } > > @@ -520,9 +557,10 @@ cannot_free: > * successfully detached, return 1. Assumes the caller has a single ref on > * this page. > */ > -int remove_mapping(struct address_space *mapping, struct page *page) > +int remove_mapping(struct address_space *mapping, struct page *page, > + bool irqcontext) > { > - if (__remove_mapping(mapping, page)) { > + if (__remove_mapping(mapping, page, irqcontext)) { > /* > * Unfreezing the refcount with 1 rather than 2 effectively > * drops the pagecache ref for us without requiring another > @@ -904,7 +942,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, > } > } > > - if (!mapping || !__remove_mapping(mapping, page)) > + if (!mapping || !__remove_mapping(mapping, page, false)) > goto keep_locked; > > /* > -- > 1.8.2.1 > -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>