Some data exfiltration and return-oriented-programming attacks rely on the ability to infer the location of sensitive data objects. The kernel page allocator, especially early in system boot, has predictable first-in-first-out behavior for physical pages. Pages are freed in physical address order when first onlined. Introduce shuffle_free_memory(), and its helper shuffle_zone(), to perform a Fisher-Yates shuffle of the page allocator 'free_area' lists when they are initially populated with free memory. The shuffling is done in terms of 'shuffle_page_order' sized free pages where the default shuffle_page_order is MAX_ORDER-1, i.e. order 10, or 4MB with 4KB pages. The performance impact of the shuffling appears to be in the noise compared to other memory initialization work. Also, the bulk of the work is done in the background as a part of deferred_init_memmap(). Cc: Michal Hocko <mhocko@xxxxxxxx> Cc: Kees Cook <keescook@xxxxxxxxxxxx> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx> --- include/linux/list.h | 17 +++++ include/linux/mm.h | 2 + include/linux/mmzone.h | 4 + mm/bootmem.c | 9 ++- mm/nobootmem.c | 7 ++ mm/page_alloc.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 207 insertions(+), 4 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index de04cc5ed536..43f963328d7c 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -150,6 +150,23 @@ static inline void list_replace_init(struct list_head *old, INIT_LIST_HEAD(old); } +/** + * list_swap - replace entry1 with entry2 and re-add entry1 at entry2's position + * @entry1: the location to place entry2 + * @entry2: the location to place entry1 + */ +static inline void list_swap(struct list_head *entry1, + struct list_head *entry2) +{ + struct list_head *pos = entry2->prev; + + list_del(entry2); + list_replace(entry1, entry2); + if (pos == entry1) + pos = entry2; + list_add(entry1, pos); +} + /** * list_del_init - deletes entry from 
list and reinitialize it. * @entry: the element to delete from the list. diff --git a/include/linux/mm.h b/include/linux/mm.h index a61ebe8ad4ca..588f34e4390e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2040,6 +2040,8 @@ extern void adjust_managed_page_count(struct page *page, long count); extern void mem_init_print_info(const char *str); extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); +extern void shuffle_free_memory(pg_data_t *pgdat, unsigned long start_pfn, + unsigned long end_pfn); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void __free_reserved_page(struct page *page) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1e22d96734e0..8f8fc7dab5cb 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1277,6 +1277,10 @@ void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) +static inline int pfn_present(unsigned long pfn) +{ + return 1; +} #endif /* CONFIG_SPARSEMEM */ /* diff --git a/mm/bootmem.c b/mm/bootmem.c index 97db0e8e362b..7f5ff899c622 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -210,6 +210,7 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; + int nid = bdata - bootmem_node_data; unsigned long *map, start, end, pages, cur, count = 0; if (!bdata->node_bootmem_map) @@ -219,8 +220,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) start = bdata->node_min_pfn; end = bdata->node_low_pfn; - bdebug("nid=%td start=%lx end=%lx\n", - bdata - bootmem_node_data, start, end); + bdebug("nid=%d start=%lx end=%lx\n", nid, start, end); while (start < end) { unsigned long idx, vec; @@ -276,7 +276,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) __free_pages_bootmem(page++, cur++, 0); bdata->node_bootmem_map = NULL; 
- bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count); + shuffle_free_memory(NODE_DATA(nid), bdata->node_min_pfn, + bdata->node_low_pfn); + + bdebug("nid=%d released=%lx\n", nid, count); return count; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 439af3b765a7..40b42434e805 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -131,6 +131,7 @@ static unsigned long __init free_low_memory_core_early(void) { unsigned long count = 0; phys_addr_t start, end; + pg_data_t *pgdat; u64 i; memblock_clear_hotplug(0, -1); @@ -144,8 +145,12 @@ static unsigned long __init free_low_memory_core_early(void) * low ram will be on Node1 */ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, - NULL) + NULL) { count += __free_memory_core(start, end); + for_each_online_pgdat(pgdat) + shuffle_free_memory(pgdat, PHYS_PFN(start), + PHYS_PFN(end)); + } return count; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 89d2a2ab3fe6..2fff9e69d8f3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -55,6 +55,7 @@ #include <trace/events/kmem.h> #include <trace/events/oom.h> #include <linux/prefetch.h> +#include <linux/random.h> #include <linux/mm_inline.h> #include <linux/migrate.h> #include <linux/hugetlb.h> @@ -72,6 +73,13 @@ #include <asm/div64.h> #include "internal.h" +/* + * page_alloc.shuffle_page_order gates which page orders are shuffled by + * shuffle_zone() during memory initialization. + */ +static int __read_mostly shuffle_page_order = MAX_ORDER-1; +module_param(shuffle_page_order, int, 0444); + /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ static DEFINE_MUTEX(pcp_batch_high_lock); #define MIN_PERCPU_PAGELIST_FRACTION (8) @@ -1035,6 +1043,168 @@ static __always_inline bool free_pages_prepare(struct page *page, return true; } +/* + * For two pages to be swapped in the shuffle, they must be free (on a + * 'free_area' lru), have the same order, and have the same migratetype. 
+ */ +static struct page * __init shuffle_valid_page(unsigned long pfn, int order) +{ + struct page *page; + + /* + * Given we're dealing with randomly selected pfns in a zone we + * need to ask questions like... + */ + + /* ...is the pfn even in the memmap? */ + if (!pfn_valid_within(pfn)) + return NULL; + + /* ...is the pfn in a present section or a hole? */ + if (!pfn_present(pfn)) + return NULL; + + /* ...is the page free and currently on a free_area list? */ + page = pfn_to_page(pfn); + if (!PageBuddy(page)) + return NULL; + + /* + * ...is the page on the same list as the page we will + * shuffle it with? + */ + if (page_order(page) != order) + return NULL; + + return page; +} + +/* + * Fisher-Yates shuffle the freelist which prescribes iterating through + * an array, pfns in this case, and randomly swapping each entry with + * another in the span, end_pfn - start_pfn. + * + * To keep the implementation simple it does not attempt to correct for + * sources of bias in the distribution, like modulo bias or + * pseudo-random number generator bias. I.e. the expectation is that + * this shuffling raises the bar for attacks that exploit the + * predictability of page allocations, but need not be a perfect + * shuffle. + * + * Note that we don't use @z->zone_start_pfn and zone_end_pfn(@z) + * directly since the caller may be aware of holes in the zone and can + * improve the accuracy of the random pfn selection. 
+ */ +#define SHUFFLE_RETRY 10 +static void __init shuffle_zone_order(struct zone *z, unsigned long start_pfn, + unsigned long end_pfn, const int order) +{ + unsigned long i, flags; + const int order_pages = 1 << order; + + if (start_pfn < z->zone_start_pfn) + start_pfn = z->zone_start_pfn; + if (end_pfn > zone_end_pfn(z)) + end_pfn = zone_end_pfn(z); + + /* probably means that start/end were outside the zone */ + if (end_pfn <= start_pfn) + return; + spin_lock_irqsave(&z->lock, flags); + start_pfn = ALIGN(start_pfn, order_pages); + for (i = start_pfn; i < end_pfn; i += order_pages) { + unsigned long j; + int migratetype, retry; + struct page *page_i, *page_j; + + /* + * We expect page_i, in the sub-range of a zone being + * added (@start_pfn to @end_pfn), to more likely be + * valid compared to page_j randomly selected in the + * span @zone_start_pfn to @spanned_pages. + */ + page_i = shuffle_valid_page(i, order); + if (!page_i) + continue; + + for (retry = 0; retry < SHUFFLE_RETRY; retry++) { + /* + * Pick a random order-aligned page from the + * start of the zone. Use the *whole* zone here + * so that, if it is freed in tiny pieces, we + * randomize across the whole zone, not just within + * those fragments. + * + * Since page_j comes from a potentially sparse + * address range we want to try a bit harder to + * find a shuffle point for page_i. + */ + j = z->zone_start_pfn + + ALIGN_DOWN(get_random_long() % z->spanned_pages, + order_pages); + page_j = shuffle_valid_page(j, order); + if (page_j && page_j != page_i) + break; + } + if (retry >= SHUFFLE_RETRY) { + pr_debug("%s: failed to swap %#lx\n", __func__, i); + continue; + } + + /* + * Each migratetype corresponds to its own list, make + * sure the types match otherwise we're moving pages to + * lists where they do not belong. 
+ */ + migratetype = get_pageblock_migratetype(page_i); + if (get_pageblock_migratetype(page_j) != migratetype) { + pr_debug("%s: migratetype mismatch %#lx\n", __func__, i); + continue; + } + + list_swap(&page_i->lru, &page_j->lru); + + pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j); + + /* take it easy on the zone lock */ + if ((i % (100 * order_pages)) == 0) { + spin_unlock_irqrestore(&z->lock, flags); + cond_resched(); + spin_lock_irqsave(&z->lock, flags); + } + } + spin_unlock_irqrestore(&z->lock, flags); +} + +static void __init shuffle_zone(struct zone *z, unsigned long start_pfn, + unsigned long end_pfn) +{ + int i; + + /* shuffle all the orders at the specified order and higher */ + for (i = shuffle_page_order; i < MAX_ORDER; i++) + shuffle_zone_order(z, start_pfn, end_pfn, i); +} + +/** + * shuffle_free_memory - reduce the predictability of the page allocator + * @pgdat: node page data + * @start_pfn: Limit the shuffle to the greater of this value or zone start + * @end_pfn: Limit the shuffle to the lesser of this value or zone end + * + * While shuffle_zone() attempts to avoid holes with pfn_valid() and + * pfn_present(), they cannot report sub-section sized holes. @start_pfn + * and @end_pfn limit the shuffle to the exact memory pages being freed. + */ +void __init shuffle_free_memory(pg_data_t *pgdat, unsigned long start_pfn, + unsigned long end_pfn) +{ + struct zone *z; + + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) + shuffle_zone(z, start_pfn, end_pfn); +} + #ifdef CONFIG_DEBUG_VM static inline bool free_pcp_prepare(struct page *page) { @@ -1583,6 +1753,8 @@ static int __init deferred_init_memmap(void *data) } pgdat_resize_unlock(pgdat, &flags); + shuffle_zone(zone, first_init_pfn, zone_end_pfn(zone)); + /* Sanity check that the next zone really is unpopulated */ WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));