There have been deadlock reports[1, 2] where put_page is called from softirq context and this causes trouble with the hugetlb_lock, as well as potentially the subpool lock. For such an unlikely scenario, let's not add irq dancing overhead to the lock+unlock operations, which could incur expensive instruction dependencies, particularly when considering hard-irq safety. For example PUSHF+POPF on x86. Instead, just use a workqueue and do the free_huge_page() in regular task context. [1] https://lore.kernel.org/lkml/20191211194615.18502-1-longman@xxxxxxxxxx/ [2] https://lore.kernel.org/lkml/20180905112341.21355-1-aneesh.kumar@xxxxxxxxxxxxx/ Reported-by: Waiman Long <longman@xxxxxxxxxx> Reported-by: Aneesh Kumar K.V <aneesh.kumar@xxxxxxxxxxxxx> Signed-off-by: Davidlohr Bueso <dbueso@xxxxxxx> --- - Changes from v1: Only use wq when in_interrupt(), otherwise business as usual. Also include the proper header file. - While I have not reproduced this issue, the v1 using wq passes all hugetlb-related tests in ltp. 
mm/hugetlb.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 49 insertions(+), 6 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac65bb5e38ac..f28cf601938d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,7 @@
 #include <linux/swapops.h>
 #include <linux/jhash.h>
 #include <linux/numa.h>
+#include <linux/llist.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -1136,7 +1137,7 @@ static inline void ClearPageHugeTemporary(struct page *page)
 	page[2].mapping = NULL;
 }
 
-void free_huge_page(struct page *page)
+static void __free_huge_page(struct page *page)
 {
 	/*
 	 * Can't pass hstate in here because it is called from the
@@ -1199,6 +1200,48 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
+/*
+ * free_huge_page() can be called from hardirq or softirq context (e.g.
+ * a put_page() from an interrupt handler), where sleeping is forbidden.
+ * We therefore must not flush_work() to wait for the deferred free, nor
+ * allocate memory on the free path.  Instead, queue the page on a
+ * lockless list and let a worker drain it in task context.  The work
+ * struct is static and the llist node is carved out of the (unused at
+ * this point) page->mapping field, so the atomic path only does an
+ * llist_add() and, at most, a schedule_work().
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+	struct llist_node *node;
+	struct page *page;
+
+	node = llist_del_all(&hpage_freelist);
+
+	while (node) {
+		page = container_of((struct address_space **)node,
+				    struct page, mapping);
+		node = node->next;
+		__free_huge_page(page);
+	}
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+void free_huge_page(struct page *page)
+{
+	/*
+	 * Defer freeing unless we are in task context: hugetlb_lock and
+	 * spool->lock are not irq-safe, and waiting for the work (or an
+	 * on-stack work item) would sleep, which is not allowed here.
+	 *
+	 * llist_add() returns true only when the list was previously
+	 * empty, so the work is scheduled once per batch of pages.
+	 */
+	if (unlikely(!in_task())) {
+		if (llist_add((struct llist_node *)&page->mapping,
+			      &hpage_freelist))
+			schedule_work(&free_hpage_work);
+		return;
+	}
+
+	__free_huge_page(page);
+}
+
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
 	INIT_LIST_HEAD(&page->lru);
-- 
2.16.4