[patch 1/2] mm: make page pfmemalloc check more robust

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Michal Hocko <mhocko@xxxxxxxx>
Subject: mm: make page pfmemalloc check more robust

The patch c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb") added
the checks for page->pfmemalloc to __skb_fill_page_desc():

        if (page->pfmemalloc && !page->mapping)
                skb->pfmemalloc = true;

It assumes page->mapping == NULL implies that page->pfmemalloc can be
trusted.  However, __delete_from_page_cache() can set page->mapping to
NULL and leave the page->index value alone.  Because the two fields share
a union, a non-zero page->index will be interpreted as a true
page->pfmemalloc.

So the assumption is invalid if the networking code can see such a page.
And it seems it can: we have encountered this with an NFS-over-loopback
setup, where such a page is attached to a new skbuff.  There is no copying
going on in this case, so the page confuses __skb_fill_page_desc, which
interprets the index as the pfmemalloc flag.  The network stack then drops
packets that have been allocated using the reserves unless they are to be
queued on sockets handling the swapping, which is the case here.  That
leads to hangs when the NFS client waits for a response from the server
which has been dropped and thus never arrives.

The struct page is already heavily packed, so rather than finding another
hole to put it in, let's do a trick instead.  We can reuse the index again
but set it to an impossible value (-1UL).  This is the page index, so it
should never see a value that large.  Replace all direct users of
page->pfmemalloc with page_is_pfmemalloc, which will hide this nastiness
from unspoiled eyes.

The information will obviously get lost if somebody wants to use
page->index, but that was the case before, and the original code expected
that the information be persisted somewhere else if it is really needed
(e.g. what SLAB and SLUB do).

[akpm@xxxxxxxxxxxxxxxxxxxx: fix blooper in slub]
Fixes: c48a11c7ad26 ("netvm: propagate page->pfmemalloc to skb")
Signed-off-by: Michal Hocko <mhocko@xxxxxxxx>
Debugged-by: Vlastimil Babka <vbabka@xxxxxxxx>
Debugged-by: Jiri Bohac <jbohac@xxxxxxxx>
Cc: Eric Dumazet <eric.dumazet@xxxxxxxxx>
Cc: David Miller <davem@xxxxxxxxxxxxx>
Acked-by: Mel Gorman <mgorman@xxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx>	[3.6+]
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 drivers/net/ethernet/intel/fm10k/fm10k_main.c     |    2 
 drivers/net/ethernet/intel/igb/igb_main.c         |    2 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c     |    2 
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |    2 
 include/linux/mm.h                                |   28 ++++++++++++
 include/linux/mm_types.h                          |    9 ---
 include/linux/skbuff.h                            |   14 ++----
 mm/page_alloc.c                                   |    9 ++-
 mm/slab.c                                         |    4 -
 mm/slub.c                                         |    2 
 net/core/skbuff.c                                 |    2 
 11 files changed, 47 insertions(+), 29 deletions(-)

diff -puN drivers/net/ethernet/intel/fm10k/fm10k_main.c~mm-make-page-pfmemalloc-check-more-robust drivers/net/ethernet/intel/fm10k/fm10k_main.c
--- a/drivers/net/ethernet/intel/fm10k/fm10k_main.c~mm-make-page-pfmemalloc-check-more-robust
+++ a/drivers/net/ethernet/intel/fm10k/fm10k_main.c
@@ -216,7 +216,7 @@ static void fm10k_reuse_rx_page(struct f
 
 static inline bool fm10k_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 static bool fm10k_can_reuse_rx_page(struct fm10k_rx_buffer *rx_buffer,
diff -puN drivers/net/ethernet/intel/igb/igb_main.c~mm-make-page-pfmemalloc-check-more-robust drivers/net/ethernet/intel/igb/igb_main.c
--- a/drivers/net/ethernet/intel/igb/igb_main.c~mm-make-page-pfmemalloc-check-more-robust
+++ a/drivers/net/ethernet/intel/igb/igb_main.c
@@ -6566,7 +6566,7 @@ static void igb_reuse_rx_page(struct igb
 
 static inline bool igb_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
diff -puN drivers/net/ethernet/intel/ixgbe/ixgbe_main.c~mm-make-page-pfmemalloc-check-more-robust drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c~mm-make-page-pfmemalloc-check-more-robust
+++ a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -1832,7 +1832,7 @@ static void ixgbe_reuse_rx_page(struct i
 
 static inline bool ixgbe_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 /**
diff -puN drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c~mm-make-page-pfmemalloc-check-more-robust drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c~mm-make-page-pfmemalloc-check-more-robust
+++ a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -765,7 +765,7 @@ static void ixgbevf_reuse_rx_page(struct
 
 static inline bool ixgbevf_page_is_reserved(struct page *page)
 {
-	return (page_to_nid(page) != numa_mem_id()) || page->pfmemalloc;
+	return (page_to_nid(page) != numa_mem_id()) || page_is_pfmemalloc(page);
 }
 
 /**
diff -puN include/linux/mm.h~mm-make-page-pfmemalloc-check-more-robust include/linux/mm.h
--- a/include/linux/mm.h~mm-make-page-pfmemalloc-check-more-robust
+++ a/include/linux/mm.h
@@ -1003,6 +1003,34 @@ static inline int page_mapped(struct pag
 }
 
 /*
+ * Return true only if the page has been allocated with
+ * ALLOC_NO_WATERMARKS and the low watermark was not
+ * met implying that the system is under some pressure.
+ */
+static inline bool page_is_pfmemalloc(struct page *page)
+{
+	/*
+	 * Page index cannot be this large so this must be
+	 * a pfmemalloc page.
+	 */
+	return page->index == -1UL;
+}
+
+/*
+ * Only to be called by the page allocator on a freshly allocated
+ * page.
+ */
+static inline void set_page_pfmemalloc(struct page *page)
+{
+	page->index = -1UL;
+}
+
+static inline void clear_page_pfmemalloc(struct page *page)
+{
+	page->index = 0;
+}
+
+/*
  * Different kinds of faults, as returned by handle_mm_fault().
  * Used to decide whether a process gets delivered SIGBUS or
  * just gets major/minor fault counters bumped up.
diff -puN include/linux/mm_types.h~mm-make-page-pfmemalloc-check-more-robust include/linux/mm_types.h
--- a/include/linux/mm_types.h~mm-make-page-pfmemalloc-check-more-robust
+++ a/include/linux/mm_types.h
@@ -63,15 +63,6 @@ struct page {
 		union {
 			pgoff_t index;		/* Our offset within mapping. */
 			void *freelist;		/* sl[aou]b first free object */
-			bool pfmemalloc;	/* If set by the page allocator,
-						 * ALLOC_NO_WATERMARKS was set
-						 * and the low watermark was not
-						 * met implying that the system
-						 * is under some pressure. The
-						 * caller should try ensure
-						 * this page is only used to
-						 * free other pages.
-						 */
 		};
 
 		union {
diff -puN include/linux/skbuff.h~mm-make-page-pfmemalloc-check-more-robust include/linux/skbuff.h
--- a/include/linux/skbuff.h~mm-make-page-pfmemalloc-check-more-robust
+++ a/include/linux/skbuff.h
@@ -1602,20 +1602,16 @@ static inline void __skb_fill_page_desc(
 	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
 
 	/*
-	 * Propagate page->pfmemalloc to the skb if we can. The problem is
-	 * that not all callers have unique ownership of the page. If
-	 * pfmemalloc is set, we check the mapping as a mapping implies
-	 * page->index is set (index and pfmemalloc share space).
-	 * If it's a valid mapping, we cannot use page->pfmemalloc but we
-	 * do not lose pfmemalloc information as the pages would not be
-	 * allocated using __GFP_MEMALLOC.
+	 * Propagate page pfmemalloc to the skb if we can. The problem is
+	 * that not all callers have unique ownership of the page but rely
+	 * on page_is_pfmemalloc doing the right thing(tm).
 	 */
 	frag->page.p		  = page;
 	frag->page_offset	  = off;
 	skb_frag_size_set(frag, size);
 
 	page = compound_head(page);
-	if (page->pfmemalloc && !page->mapping)
+	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc	= true;
 }
 
@@ -2263,7 +2259,7 @@ static inline struct page *dev_alloc_pag
 static inline void skb_propagate_pfmemalloc(struct page *page,
 					     struct sk_buff *skb)
 {
-	if (page && page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
 }
 
diff -puN mm/page_alloc.c~mm-make-page-pfmemalloc-check-more-robust mm/page_alloc.c
--- a/mm/page_alloc.c~mm-make-page-pfmemalloc-check-more-robust
+++ a/mm/page_alloc.c
@@ -1343,12 +1343,15 @@ static int prep_new_page(struct page *pa
 	set_page_owner(page, order, gfp_flags);
 
 	/*
-	 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
+	 * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
 	 * allocate the page. The expectation is that the caller is taking
 	 * steps that will free more memory. The caller should avoid the page
 	 * being used for !PFMEMALLOC purposes.
 	 */
-	page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+	if (alloc_flags & ALLOC_NO_WATERMARKS)
+		set_page_pfmemalloc(page);
+	else
+		clear_page_pfmemalloc(page);
 
 	return 0;
 }
@@ -3345,7 +3348,7 @@ refill:
 		atomic_add(size - 1, &page->_count);
 
 		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page->pfmemalloc;
+		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = size;
 		nc->offset = size;
 	}
diff -puN mm/slab.c~mm-make-page-pfmemalloc-check-more-robust mm/slab.c
--- a/mm/slab.c~mm-make-page-pfmemalloc-check-more-robust
+++ a/mm/slab.c
@@ -1603,7 +1603,7 @@ static struct page *kmem_getpages(struct
 	}
 
 	/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
-	if (unlikely(page->pfmemalloc))
+	if (page_is_pfmemalloc(page))
 		pfmemalloc_active = true;
 
 	nr_pages = (1 << cachep->gfporder);
@@ -1614,7 +1614,7 @@ static struct page *kmem_getpages(struct
 		add_zone_page_state(page_zone(page),
 			NR_SLAB_UNRECLAIMABLE, nr_pages);
 	__SetPageSlab(page);
-	if (page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
 	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
diff -puN mm/slub.c~mm-make-page-pfmemalloc-check-more-robust mm/slub.c
--- a/mm/slub.c~mm-make-page-pfmemalloc-check-more-robust
+++ a/mm/slub.c
@@ -1427,7 +1427,7 @@ static struct page *new_slab(struct kmem
 	inc_slabs_node(s, page_to_nid(page), page->objects);
 	page->slab_cache = s;
 	__SetPageSlab(page);
-	if (page->pfmemalloc)
+	if (page_is_pfmemalloc(page))
 		SetPageSlabPfmemalloc(page);
 
 	start = page_address(page);
diff -puN net/core/skbuff.c~mm-make-page-pfmemalloc-check-more-robust net/core/skbuff.c
--- a/net/core/skbuff.c~mm-make-page-pfmemalloc-check-more-robust
+++ a/net/core/skbuff.c
@@ -340,7 +340,7 @@ struct sk_buff *build_skb(void *data, un
 
 	if (skb && frag_size) {
 		skb->head_frag = 1;
-		if (virt_to_head_page(data)->pfmemalloc)
+		if (page_is_pfmemalloc(virt_to_head_page(data)))
 			skb->pfmemalloc = 1;
 	}
 	return skb;
_
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]