Re: [PATCH, RFC] x86/mm/pat: Restore large pages after fragmentation

Mika Penttilä <mika.penttila@xxxxxxxxxxxx> · Fri, 17 Apr 2020 15:05:22 +0300

On 17.4.2020 14.42, Kirill A. Shutemov wrote:
On Fri, Apr 17, 2020 at 03:52:28AM +0300, Mika Penttilä wrote:
Hi!

On 17.4.2020 0.32, Kirill A. Shutemov wrote:
Changing the attributes of pages may lead to fragmentation of the direct
mapping over time and, as a result, to performance degradation.

With the current code it's a one-way road: the kernel tries to avoid
splitting large pages, but it doesn't restore them even if the page
attributes become compatible again.

Any change to the mapping may potentially allow a large page to be restored.

Hook into the cpa_flush() path to check whether there are any pages to be
recovered in the PUD_SIZE range around the pages we've just touched.

CPUs don't like[1] to have TLB entries of different sizes for the
same memory, but it looks like it's okay as long as these entries have
matching attributes[2]. Therefore it's critical to flush the TLB before any
following changes to the mapping.

Note that we already allow for multiple TLB entries of different sizes
for the same memory now in split_large_page() path. It's not a new
situation.

set_memory_4k() provides a way to use 4k pages on purpose. Kernel must
not remap such pages as large. Re-use one of software PTE bits to
indicate such pages.

[1] See Erratum 383 of AMD Family 10h Processors
[2] https://lore.kernel.org/linux-mm/1da1b025-cabc-6f04-bde5-e50830d1ecf0@xxxxxxx/

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx>
---
   arch/x86/include/asm/pgtable_types.h |   2 +
   arch/x86/mm/pat/set_memory.c         | 191 ++++++++++++++++++++++++++-
   2 files changed, 188 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b6606fe6cfdf..11ed34804343 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -34,6 +34,7 @@
   #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
   #define _PAGE_BIT_UFFD_WP	_PAGE_BIT_SOFTW2 /* userfaultfd wrprotected */
   #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
+#define _PAGE_BIT_KERNEL_4K	_PAGE_BIT_SOFTW3 /* page must not be converted to large */
   #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
   /* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -56,6 +57,7 @@
   #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
   #define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
   #define _PAGE_CPA_TEST	(_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
+#define _PAGE_KERNEL_4K	(_AT(pteval_t, 1) << _PAGE_BIT_KERNEL_4K)
   #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
   #define _PAGE_PKEY_BIT0	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT0)
   #define _PAGE_PKEY_BIT1	(_AT(pteval_t, 1) << _PAGE_BIT_PKEY_BIT1)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 5414fabad1ae..7cb04a436d86 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -344,22 +344,56 @@ static void __cpa_flush_tlb(void *data)
   		__flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
   }
+static void restore_large_pages(unsigned long addr, struct list_head *pgtables);
+
+static void cpa_restore_large_pages(struct cpa_data *cpa,
+		struct list_head *pgtables)
+{
+	unsigned long start, addr, end;
+	int i;
+
+	if (cpa->flags & CPA_PAGES_ARRAY) {
+		for (i = 0; i < cpa->numpages; i++)
+			restore_large_pages(__cpa_addr(cpa, i), pgtables);
+		return;
+	}
+
+	start = __cpa_addr(cpa, 0);
+	end = start + PAGE_SIZE * cpa->numpages;
+
+	for (addr = start; addr >= start && addr < end; addr += PUD_SIZE)
+		restore_large_pages(addr, pgtables);
+}
+
   static void cpa_flush(struct cpa_data *data, int cache)
   {
+	LIST_HEAD(pgtables);
+	struct page *page, *tmp;
   	struct cpa_data *cpa = data;
   	unsigned int i;
   	BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+	cpa_restore_large_pages(data, &pgtables);
+
   	if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
   		cpa_flush_all(cache);
+		list_for_each_entry_safe(page, tmp, &pgtables, lru) {
+			list_del(&page->lru);
+			__free_page(page);
+		}
   		return;
   	}
-	if (cpa->numpages <= tlb_single_page_flush_ceiling)
-		on_each_cpu(__cpa_flush_tlb, cpa, 1);
-	else
+	if (cpa->numpages > tlb_single_page_flush_ceiling || !list_empty(&pgtables))
   		flush_tlb_all();
+	else
+		on_each_cpu(__cpa_flush_tlb, cpa, 1);
+
+	list_for_each_entry_safe(page, tmp, &pgtables, lru) {
+		list_del(&page->lru);
+		__free_page(page);
+	}

The pagetable pages are freed here, but aren't you leaking the leaf 4K pages
(except the first, which is made large)?
Huh? There's no way to convert one 4k to large page. We convert all pages
in large page region to the large page.


Doh, sure, and sorry, I was short of coffee ...

--Mika