On Fri, Feb 20, 2015 at 09:52:17AM +1100, Dave Chinner wrote: > I will actively work around anything that causes filesystem memory > pressure to increase the chance of oom killer invocations. The OOM > killer is not a solution - it is, by definition, a loose cannon and > so we should be reducing dependencies on it. Once we have a better-working alternative, sure. > I really don't care about the OOM Killer corner cases - it's > completely the wrong line of development to be spending time on > and you aren't going to convince me otherwise. The OOM killer is a > crutch used to justify having a memory allocation subsystem that > can't provide forward progress guarantee mechanisms to callers that > need it. We can provide this. Are all these callers able to preallocate? --- diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 51bd1e72a917..af81b8a67651 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -380,6 +380,10 @@ extern void free_kmem_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) +void register_private_page(struct page *page, unsigned int order); +int alloc_private_pages(gfp_t gfp_mask, unsigned int order, unsigned int nr); +void free_private_pages(void); + void page_alloc_init(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..1fe390779f23 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1545,6 +1545,8 @@ struct task_struct { #endif /* VM state */ + struct list_head private_pages; + struct reclaim_state *reclaim_state; struct backing_dev_info *backing_dev_info; diff --git a/kernel/fork.c b/kernel/fork.c index cf65139615a0..b6349b0e5da2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1308,6 +1308,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, 
memset(&p->rss_stat, 0, sizeof(p->rss_stat)); #endif + INIT_LIST_HEAD(&p->private_pages); + p->default_timer_slack_ns = current->timer_slack_ns; task_io_accounting_init(&p->ioac); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a47f0b229a1a..546db4e0da75 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -490,12 +490,10 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, static inline void set_page_order(struct page *page, unsigned int order) { set_page_private(page, order); - __SetPageBuddy(page); } static inline void rmv_page_order(struct page *page) { - __ClearPageBuddy(page); set_page_private(page, 0); } @@ -617,6 +615,7 @@ static inline void __free_one_page(struct page *page, list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); + __ClearPageBuddy(buddy); } combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); @@ -624,6 +623,7 @@ static inline void __free_one_page(struct page *page, order++; } set_page_order(page, order); + __SetPageBuddy(page); /* * If this is not the largest possible page, check if the buddy @@ -924,6 +924,7 @@ static inline void expand(struct zone *zone, struct page *page, list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); + __SetPageBuddy(page); } } @@ -1015,6 +1016,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, struct page, lru); list_del(&page->lru); rmv_page_order(page); + __ClearPageBuddy(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); set_freepage_migratetype(page, migratetype); @@ -1212,6 +1214,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); + __ClearPageBuddy(page); expand(zone, page, order, current_order, area, buddy_type); @@ -1598,6 +1601,7 @@ int __isolate_free_page(struct page *page, unsigned int order) 
list_del(&page->lru); zone->free_area[order].nr_free--; rmv_page_order(page); + __ClearPageBuddy(page); /* Set the pageblock if the isolated page is at least a pageblock */ if (order >= pageblock_order - 1) { @@ -2504,6 +2508,40 @@ retry: return page; } +/* Try to allocate from the caller's private memory reserves */ +static inline struct page * +__alloc_pages_private(gfp_t gfp_mask, unsigned int order, + const struct alloc_context *ac) +{ + unsigned int uninitialized_var(alloc_order); + struct page *page = NULL; + struct page *p; + + /* Dopy, but this is a slowpath right before OOM */ + list_for_each_entry(p, &current->private_pages, lru) { + int o = page_order(p); + + if (o >= order && (!page || o < alloc_order)) { + page = p; + alloc_order = o; + } + } + if (!page) + return NULL; + + list_del(&page->lru); + rmv_page_order(page); + + /* Give back the remainder */ + while (alloc_order > order) { + alloc_order--; + set_page_order(&page[1 << alloc_order], alloc_order); + list_add(&page[1 << alloc_order].lru, &current->private_pages); + } + + return page; +} + /* * This is called in the allocator slow-path if the allocation request is of * sufficient urgency to ignore watermarks and take other desperate measures @@ -2753,9 +2791,13 @@ retry: /* * If we fail to make progress by freeing individual * pages, but the allocation wants us to keep going, - * start OOM killing tasks. + * dip into private reserves, or start OOM killing. 
*/ if (!did_some_progress) { + page = __alloc_pages_private(gfp_mask, order, ac); + if (page) + goto got_pg; + page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress); if (page) @@ -3046,6 +3088,82 @@ void free_pages_exact(void *virt, size_t size) EXPORT_SYMBOL(free_pages_exact); /** + * alloc_private_pages - allocate private memory reserve pages + * @gfp_mask: gfp flags for the allocations + * @order: order of pages to allocate + * @nr: number of pages to allocate + * + * This allocates @nr pages of order @order as an emergency reserve of + * the calling task, to be used by the page allocator if an allocation + * would otherwise fail. + * + * The caller is responsible for calling free_private_pages() once the + * reserves are no longer required. + */ +int alloc_private_pages(gfp_t gfp_mask, unsigned int order, unsigned int nr) +{ + struct page *page, *page2; + LIST_HEAD(pages); + unsigned int i; + + for (i = 0; i < nr; i++) { + page = alloc_pages(gfp_mask, order); + if (!page) + goto error; + set_page_order(page, order); + list_add(&page->lru, &pages); + } + + list_splice(&pages, &current->private_pages); + return 0; + +error: + list_for_each_entry_safe(page, page2, &pages, lru) { + list_del(&page->lru); + rmv_page_order(page); + __free_pages(page, order); + } + return -ENOMEM; +} + +/** + * register_private_page - register a private memory reserve page + * @page: pre-allocated page + * @order: @page's order + * + * This registers @page as an emergency reserve of the calling task, + * to be used by the page allocator if an allocation would otherwise + * fail. + * + * The caller is responsible for calling free_private_pages() once the + * reserves are no longer required. 
+ */ +void register_private_page(struct page *page, unsigned int order) +{ + set_page_order(page, order); + list_add(&page->lru, &current->private_pages); +} + +/** + * free_private_pages - free all private memory reserve pages + * + * Frees all (remaining) pages of the calling task's memory reserves + * established by alloc_private_pages() and register_private_page(). + */ +void free_private_pages(void) +{ + struct page *page, *page2; + + list_for_each_entry_safe(page, page2, &current->private_pages, lru) { + int order = page_order(page); + + list_del(&page->lru); + rmv_page_order(page); + __free_pages(page, order); + } +} + +/** * nr_free_zone_pages - count number of pages beyond high watermark * @offset: The zone index of the highest zone * @@ -6551,6 +6669,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) #endif list_del(&page->lru); rmv_page_order(page); + __ClearPageBuddy(page); zone->free_area[order].nr_free--; for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>