On Sun 21-12-14 09:35:04, Dave Chinner wrote: [...] > Oh, boy. > > struct page *grab_cache_page_write_begin(struct address_space *mapping, > pgoff_t index, unsigned flags) > { > struct page *page; > int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; > > if (flags & AOP_FLAG_NOFS) > fgp_flags |= FGP_NOFS; > > page = pagecache_get_page(mapping, index, fgp_flags, > mapping_gfp_mask(mapping), > GFP_KERNEL); > if (page) > wait_for_stable_page(page); > > return page; > } > > There are *3* different memory allocation controls passed to > pagecache_get_page. The first is via AOP_FLAG_NOFS, where the caller > explicitly says this allocation is in filesystem context with locks > held, and so all allocations need to be done in GFP_NOFS context. > This is used to override the second and third gfp parameters. > > The second is mapping_gfp_mask(mapping), which is the *default > allocation context* the filesystem wants the page cache to use for > allocating pages to the mapping. > > The third is a hard coded GFP_KERNEL, which is used for radix tree > node allocation. > > Why are there separate allocation contexts for the radix tree nodes > and the page cache pages when they are done under *exactly the same > caller context*? Either we are allowed to recurse into the > filesystem or we aren't, and the inode mapping mask defines that > context for all page cache allocations, not just the pages > themselves. > > And to point out how many filesystems this affects, > the loop device, btrfs, f2fs, gfs2, jfs, logfs, nil2fs, reiserfs > and XFS all use this mapping default to clear __GFP_FS from > page cache allocations. Only ext4 and gfs2 use AOP_FLAG_NOFS in > their ->write_begin callouts to prevent recusrion. > > IOWs, grab_cache_page_write_begin/pagecache_get_page multiple > allocation contexts are just wrong. It does not match the way > filesystems are informing the page cache of allocation context to > avoid recursion (for avoiding stack overflow and/or deadlock). > AOP_FLAG_NOFS should go away, and all filesystems should modify the > mapping gfp mask to set their allocation context. If should be used > *everywhere* pages are allocated into the page cache, and for all > allocations related to tracking those allocated pages. I guess the following would be a first simple step to remove the bug you are mentioning above. It would be simple enough to put into stable as well. What do you think? --- >From 24d7865f17a235653014ceea7757f2a71b309789 Mon Sep 17 00:00:00 2001 From: Michal Hocko <mhocko@xxxxxxx> Date: Mon, 29 Dec 2014 17:42:17 +0100 Subject: [PATCH] mm: get rid of radix tree gfp mask for pagecache_get_page 2457aec63745 (mm: non-atomically mark page accessed during page cache allocation where possible) has added a separate parameter for specifying gfp mask for radix tree allocations. Not only this is less than optimal from the API point of view because it is error prone, it is also buggy currently because grab_cache_page_write_begin is using GFP_KERNEL for radix tree and if fgp_flags doesn't contain FGP_NOFS (mostly controlled by fs by AOP_FLAG_NOFS flag) but the mapping_gfp_mask has __GFP_FS cleared then the radix tree allocation wouldn't obey the restriction and might recurse into filesystem and cause deadlocks. This is the case for most filesystems unfortunately because only ext4 and gfs2 are using AOP_FLAG_NOFS. Let's simply remove radix_gfp_mask parameter because the allocation context is same for both page cache and for the radix tree. Long term it is more preferable to convert remaining users of AOP_FLAG_NOFS to use mapping_gfp_mask instead and simplify this interface even further. Reported-by: Dave Chinner <david@xxxxxxxxxxxxx> Signed-off-by: Michal Hocko <mhocko@xxxxxxx> --- include/linux/pagemap.h | 13 ++++++------- mm/filemap.c | 28 +++++++++++----------------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7ea069cd3257..4b3736f7065c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -251,7 +251,7 @@ pgoff_t page_cache_prev_hole(struct address_space *mapping, #define FGP_NOWAIT 0x00000020 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, - int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask); + int fgp_flags, gfp_t cache_gfp_mask); /** * find_get_page - find and get a page reference @@ -266,13 +266,13 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, static inline struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { - return pagecache_get_page(mapping, offset, 0, 0, 0); + return pagecache_get_page(mapping, offset, 0, 0); } static inline struct page *find_get_page_flags(struct address_space *mapping, pgoff_t offset, int fgp_flags) { - return pagecache_get_page(mapping, offset, fgp_flags, 0, 0); + return pagecache_get_page(mapping, offset, fgp_flags, 0); } /** @@ -292,7 +292,7 @@ static inline struct page *find_get_page_flags(struct address_space *mapping, static inline struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) { - return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0); + return pagecache_get_page(mapping, offset, FGP_LOCK, 0); } /** @@ -319,7 +319,7 @@ static inline struct page *find_or_create_page(struct address_space *mapping, { return pagecache_get_page(mapping, offset, FGP_LOCK|FGP_ACCESSED|FGP_CREAT, - gfp_mask, gfp_mask & GFP_RECLAIM_MASK); + gfp_mask); } /** @@ -340,8 +340,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, - mapping_gfp_mask(mapping), - GFP_NOFS); + mapping_gfp_mask(mapping)); } struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); diff --git a/mm/filemap.c b/mm/filemap.c index e8905bc3cbd7..af2731cd919f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1046,8 +1046,7 @@ EXPORT_SYMBOL(find_lock_entry); * @mapping: the address_space to search * @offset: the page index * @fgp_flags: PCG flags - * @cache_gfp_mask: gfp mask to use for the page cache data page allocation - * @radix_gfp_mask: gfp mask to use for radix tree node allocation + * @gfp_mask: gfp mask to use for the page cache data page allocation * * Looks up the page cache slot at @mapping & @offset. * @@ -1056,11 +1055,9 @@ EXPORT_SYMBOL(find_lock_entry); * FGP_ACCESSED: the page will be marked accessed * FGP_LOCK: Page is return locked * FGP_CREAT: If page is not present then a new page is allocated using - * @cache_gfp_mask and added to the page cache and the VM's LRU - * list. If radix tree nodes are allocated during page cache - * insertion then @radix_gfp_mask is used. The page is returned - * locked and with an increased refcount. Otherwise, %NULL is - * returned. + * @gfp_mask and added to the page cache and the VM's LRU + * list. The page is returned locked and with an increased + * refcount. Otherwise, %NULL is returned. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. @@ -1068,7 +1065,7 @@ EXPORT_SYMBOL(find_lock_entry); * If there is a page cache page, it is returned with an increased refcount. */ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, - int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) + int fgp_flags, gfp_t gfp_mask) { struct page *page; @@ -1105,13 +1102,11 @@ no_page: if (!page && (fgp_flags & FGP_CREAT)) { int err; if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) - cache_gfp_mask |= __GFP_WRITE; - if (fgp_flags & FGP_NOFS) { - cache_gfp_mask &= ~__GFP_FS; - radix_gfp_mask &= ~__GFP_FS; - } + gfp_mask |= __GFP_WRITE; + if (fgp_flags & FGP_NOFS) + gfp_mask &= ~__GFP_FS; - page = __page_cache_alloc(cache_gfp_mask); + page = __page_cache_alloc(gfp_mask); if (!page) return NULL; @@ -1122,7 +1117,7 @@ no_page: if (fgp_flags & FGP_ACCESSED) __SetPageReferenced(page); - err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); + err = add_to_page_cache_lru(page, mapping, offset, gfp_mask); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -2443,8 +2438,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, fgp_flags |= FGP_NOFS; page = pagecache_get_page(mapping, index, fgp_flags, - mapping_gfp_mask(mapping), - GFP_KERNEL); + mapping_gfp_mask(mapping)); if (page) wait_for_stable_page(page); -- 2.1.4 -- Michal Hocko SUSE Labs -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>