From: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>

Add support for "aerating" memory in a guest by pushing individual pages
out. This patch is meant to add generic support for this by adding a
common framework that can be used later by drivers such as
virtio-balloon.

Signed-off-by: Alexander Duyck <alexander.h.duyck@xxxxxxxxxxxxxxx>
---

Note for reviewers: a sketch of how a driver might consume this
interface follows the patch.

 include/linux/memory_aeration.h |   54 +++++++
 mm/Kconfig                      |    5 +
 mm/Makefile                     |    1 
 mm/aeration.c                   |  320 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 380 insertions(+)
 create mode 100644 include/linux/memory_aeration.h
 create mode 100644 mm/aeration.c

diff --git a/include/linux/memory_aeration.h b/include/linux/memory_aeration.h
new file mode 100644
index 000000000000..5ba0e634f240
--- /dev/null
+++ b/include/linux/memory_aeration.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_AERATION_H
+#define _LINUX_MEMORY_AERATION_H
+
+#include <linux/pageblock-flags.h>
+#include <linux/jump_label.h>
+#include <asm/pgtable_types.h>
+
+struct zone;
+
+#define AERATOR_MIN_ORDER	pageblock_order
+
+struct aerator_dev_info {
+	unsigned long capacity;
+	struct list_head batch_reactor;
+	atomic_t refcnt;
+	void (*react)(struct aerator_dev_info *a_dev_info);
+};
+
+extern struct static_key aerator_notify_enabled;
+
+void aerator_cycle(void);
+void __aerator_notify(struct zone *zone, int order);
+
+/**
+ * aerator_notify_free - Free page notification that will start page processing
+ * @page: Last page processed
+ * @zone: Pointer to current zone of last page processed
+ * @order: Order of last page added to zone
+ *
+ * This function is meant to act as a screener for __aerator_notify, which
+ * will determine if a given zone has crossed over the high-water mark
+ * that justifies beginning page treatment. If we have crossed that
+ * threshold it will start the process of pulling some pages and placing
+ * them in the batch_reactor list for treatment.
+ */
+static inline void
+aerator_notify_free(struct page *page, struct zone *zone, int order)
+{
+	if (!static_key_false(&aerator_notify_enabled))
+		return;
+
+	if (order < AERATOR_MIN_ORDER)
+		return;
+
+	__aerator_notify(zone, order);
+}
+
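+/*
+ * aerator_startup() registers the one active aerator device and enables
+ * the free page notification hook; aerator_shutdown() disables the hook
+ * and waits for any outstanding hints to complete before freeing the
+ * high-water-mark bitmap.
+ */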
+void aerator_shutdown(void);
+int aerator_startup(struct aerator_dev_info *sdev);
+
+#define AERATOR_ZONE_BITS	(BITS_TO_LONGS(MAX_NR_ZONES) * BITS_PER_LONG)
+#define AERATOR_HWM_BITS	(AERATOR_ZONE_BITS * MAX_NUMNODES)
+#endif /*_LINUX_MEMORY_AERATION_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index f0c76ba47695..34680214cefa 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -236,6 +236,11 @@ config COMPACTION
 	  linux-mm@xxxxxxxxx.
 
 #
+# support for memory aeration
+config AERATION
+	bool
+
+#
 # support for page migration
 #
 config MIGRATION
diff --git a/mm/Makefile b/mm/Makefile
index ac5e5ba78874..26c2fcd2b89d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
 obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
 obj-$(CONFIG_HMM) += hmm.o
 obj-$(CONFIG_MEMFD_CREATE) += memfd.o
+obj-$(CONFIG_AERATION) += aeration.o
diff --git a/mm/aeration.c b/mm/aeration.c
new file mode 100644
index 000000000000..aaf8af8d822f
--- /dev/null
+++ b/mm/aeration.c
@@ -0,0 +1,320 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/memory_aeration.h>
+#include <linux/mmzone.h>
+#include <linux/gfp.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+static unsigned long *aerator_hwm;
+static struct aerator_dev_info *a_dev_info;
+struct static_key aerator_notify_enabled;
+
+void aerator_shutdown(void)
+{
+	static_key_slow_dec(&aerator_notify_enabled);
+
+	while (atomic_read(&a_dev_info->refcnt))
+		msleep(20);
+
+	kfree(aerator_hwm);
+	aerator_hwm = NULL;
+
+	a_dev_info = NULL;
+}
+EXPORT_SYMBOL_GPL(aerator_shutdown);
+
+int aerator_startup(struct aerator_dev_info *sdev)
+{
+	size_t size = BITS_TO_LONGS(AERATOR_HWM_BITS) * sizeof(unsigned long);
+	unsigned long *hwm;
+
+	if (a_dev_info || aerator_hwm)
+		return -EBUSY;
+
+	a_dev_info = sdev;
+
+	atomic_set(&sdev->refcnt, 0);
+
+	hwm = kzalloc(size, GFP_KERNEL);
+	if (!hwm) {
+		/*
+		 * Do not go through aerator_shutdown() here: the static
+		 * key has not been incremented yet, so decrementing it
+		 * would underflow the jump label reference count.
+		 */
+		a_dev_info = NULL;
+		return -ENOMEM;
+	}
+
+	aerator_hwm = hwm;
+
+	static_key_slow_inc(&aerator_notify_enabled);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(aerator_startup);
+
+static inline unsigned long *get_aerator_hwm(int nid)
+{
+	if (!aerator_hwm)
+		return NULL;
+
+	return aerator_hwm + (BITS_TO_LONGS(MAX_NR_ZONES) * nid);
+}
+
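+/*
+ * __aerator_fill is called with the zone lock held. It walks the free
+ * areas from the highest order down to AERATOR_MIN_ORDER, pulling raw
+ * (untreated) pages into the batch reactor until either the requested
+ * batch size has been reached or the zone has run out of raw pages.
+ */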
+static int __aerator_fill(struct zone *zone, unsigned int size)
+{
+	struct list_head *batch = &a_dev_info->batch_reactor;
+	unsigned long nr_raw = 0;
+	unsigned int len = 0;
+	unsigned int order;
+
+	for (order = MAX_ORDER; order-- != AERATOR_MIN_ORDER;) {
+		struct free_area *area = &(zone->free_area[order]);
+		int mt = area->treatment_mt;
+
+		/*
+		 * If there are no untreated pages to pull
+		 * then we might as well skip the area.
+		 */
+		while (area->nr_free_raw) {
+			unsigned int count = 0;
+			struct page *page;
+
+			/*
+			 * If we completed aeration we can let the current
+			 * free list work on settling so that a batch of
+			 * new raw pages can build. In the meantime move on
+			 * to the next migratetype.
+			 */
+			if (++mt >= MIGRATE_TYPES)
+				mt = 0;
+
+			/*
+			 * Pull pages from the free list until we have
+			 * drained it or we have filled the batch reactor.
+			 */
+			while ((page = get_raw_pages(zone, order, mt))) {
+				list_add(&page->lru, batch);
+
+				if (++count == (size - len))
+					return size;
+			}
+
+			/*
+			 * If we pulled any pages from this migratetype then
+			 * we must move on to a new free area, as we cannot
+			 * move the membrane until after we have decanted the
+			 * pages currently being aerated.
+			 */
+			if (count) {
+				len += count;
+				break;
+			}
+		}
+
+		/*
+		 * Keep a running total of the raw pages we have left
+		 * behind. We will use this to determine if we should
+		 * clear the HWM flag.
+		 */
+		nr_raw += area->nr_free_raw;
+	}
+
+	/*
+	 * If there are no longer enough free pages to fully populate
+	 * the aerator, then we can just shut it down for this zone.
+	 */
+	if (nr_raw < a_dev_info->capacity) {
+		unsigned long *hwm = get_aerator_hwm(zone_to_nid(zone));
+
+		clear_bit(zone_idx(zone), hwm);
+		atomic_dec(&a_dev_info->refcnt);
+	}
+
+	return len;
+}
+
+static unsigned int aerator_fill(int nid, int zid, int budget)
+{
+	pg_data_t *pgdat = NODE_DATA(nid);
+	struct zone *zone = &pgdat->node_zones[zid];
+	unsigned long flags;
+	int len;
+
+	spin_lock_irqsave(&zone->lock, flags);
+
+	/* fill the aerator with "raw" pages */
+	len = __aerator_fill(zone, budget);
+
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	return len;
+}
+
+static void aerator_fill_and_react(void)
+{
+	int budget = a_dev_info->capacity;
+	int nr;
+
+	/*
+	 * We should never be calling this function while there are already
+	 * pages in the reactor being aerated. If we are called under such
+	 * a circumstance report an error.
+	 */
+	BUG_ON(!list_empty(&a_dev_info->batch_reactor));
+retry:
+	/*
+	 * We want to hold one additional reference against the number of
+	 * active hints, as we may clear the hint that originally brought us
+	 * here. We will clear it after we have either vaporized the content
+	 * of the pages, or if we discover all pages were stolen out from
+	 * under us.
+	 */
+	atomic_inc(&a_dev_info->refcnt);
+
+	for_each_set_bit(nr, aerator_hwm, AERATOR_HWM_BITS) {
+		int node_id = nr / AERATOR_ZONE_BITS;
+		int zone_id = nr % AERATOR_ZONE_BITS;
+
+		budget -= aerator_fill(node_id, zone_id, budget);
+		if (!budget)
+			goto start_aerating;
+	}
+
+	if (unlikely(list_empty(&a_dev_info->batch_reactor))) {
+		/*
+		 * If we never generated any pages, and we were holding the
+		 * only remaining reference to active hints, then we can
+		 * just let this go for now and go idle.
+		 */
+		if (atomic_dec_and_test(&a_dev_info->refcnt))
+			return;
+
+		/*
+		 * There must be a bit populated somewhere; try going
+		 * back through and finding it.
+		 */
+		goto retry;
+	}
+
+start_aerating:
+	a_dev_info->react(a_dev_info);
+}
+
+void aerator_decant(void)
+{
+	struct list_head *list = &a_dev_info->batch_reactor;
+	struct page *page;
+
+	/*
+	 * This function should never be called on an empty list. If it
+	 * is, that points to a bug, as we should never be running the
+	 * aerator when the list is empty.
+	 */
+	WARN_ON(list_empty(&a_dev_info->batch_reactor));
+
+	while ((page = list_first_entry_or_null(list, struct page, lru))) {
+		list_del(&page->lru);
+
+		__SetPageTreated(page);
+
+		free_treated_page(page);
+	}
+}
+
+/**
+ * aerator_cycle - drain, fill, and start aerating another batch of pages
+ *
+ * This function is at the heart of the aerator. It should be called after
+ * the previous batch of pages has finished being processed by the aerator.
+ * It will drain the aerator, refill it, and start the next set of pages
+ * being processed.
+ */
+void aerator_cycle(void)
+{
+	aerator_decant();
+
+	/*
+	 * Now that the pages have been flushed we can drop our reference to
+	 * the active hints list. If there are no further hints that need to
+	 * be processed we can simply go idle.
+	 */
+	if (atomic_dec_and_test(&a_dev_info->refcnt))
+		return;
+
+	aerator_fill_and_react();
+}
+EXPORT_SYMBOL_GPL(aerator_cycle);
+
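+/*
+ * __aerator_fill_and_react is reached via __aerator_notify with the
+ * zone lock held. It drops that lock while the reactor is running and
+ * reacquires it before returning, so the caller cannot assume the zone
+ * free lists are unchanged across the call.
+ */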
+static void __aerator_fill_and_react(struct zone *zone)
+{
+	/*
+	 * We should never be calling this function while there are already
+	 * pages in the list being aerated. If we are called under such a
+	 * circumstance report an error.
+	 */
+	BUG_ON(!list_empty(&a_dev_info->batch_reactor));
+
+	/*
+	 * We want to hold one additional reference against the number of
+	 * active hints, as we may clear the hint that originally brought us
+	 * here. We will clear it after we have either vaporized the content
+	 * of the pages, or if we discover all pages were stolen out from
+	 * under us.
+	 */
+	atomic_inc(&a_dev_info->refcnt);
+
+	__aerator_fill(zone, a_dev_info->capacity);
+
+	if (unlikely(list_empty(&a_dev_info->batch_reactor))) {
+		/*
+		 * If we never generated any pages, and we were holding the
+		 * only remaining reference to active hints, then we can just
+		 * let this go for now and go idle.
+		 */
+		if (atomic_dec_and_test(&a_dev_info->refcnt))
+			return;
+
+		/*
+		 * Another zone must have populated some raw pages that
+		 * need to be processed. Release the zone lock and process
+		 * that zone instead.
+		 */
+		spin_unlock(&zone->lock);
+		aerator_fill_and_react();
+	} else {
+		/* Release the zone lock and begin the page aerator */
+		spin_unlock(&zone->lock);
+		a_dev_info->react(a_dev_info);
+	}
+
+	/* Reacquire the lock so we can resume processing this zone */
+	spin_lock(&zone->lock);
+}
+
+void __aerator_notify(struct zone *zone, int order)
+{
+	int node_id = zone_to_nid(zone);
+	int zone_id = zone_idx(zone);
+	unsigned long *hwm;
+
+	if (zone->free_area[order].nr_free_raw < (2 * a_dev_info->capacity))
+		return;
+
+	hwm = get_aerator_hwm(node_id);
+
+	/*
+	 * We can use separate test and set operations here, as there
+	 * is nothing else that can set or clear this bit while we are
+	 * holding the zone lock. The advantage to doing it this way is
+	 * that we don't have to dirty the cacheline unless we are
+	 * changing the value.
+	 */
+	if (test_bit(zone_id, hwm))
+		return;
+	set_bit(zone_id, hwm);
+
+	if (atomic_fetch_inc(&a_dev_info->refcnt))
+		return;
+
+	__aerator_fill_and_react(zone);
+}
+EXPORT_SYMBOL_GPL(__aerator_notify);
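
As noted above, here is a minimal sketch of how a driver such as
virtio-balloon might consume this interface. Everything prefixed with
my_ is hypothetical and not part of this patch, and the actual
hypervisor reporting step is elided since it is device specific. The
intent is only to illustrate the expected flow: aerator_startup()
registers the device, the react callback defers processing of the
batch_reactor list out of atomic context, and aerator_cycle() is called
once a batch has been fully processed so the core can decant it and
begin filling the next one.

#include <linux/memory_aeration.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/workqueue.h>

static struct aerator_dev_info my_a_dev_info;
static struct work_struct my_work;

/*
 * The react callback may be invoked from the page freeing path with
 * interrupts still disabled, so defer the actual reporting to a
 * workqueue rather than doing it here.
 */
static void my_react(struct aerator_dev_info *a_dev_info)
{
	schedule_work(&my_work);
}

static void my_work_fn(struct work_struct *work)
{
	struct page *page;

	/* Hypothetical: report each raw page in the batch to the host. */
	list_for_each_entry(page, &my_a_dev_info.batch_reactor, lru) {
		/* e.g. add the page to a virtqueue scatterlist */
	}

	/* Decant the treated batch and start filling the next one. */
	aerator_cycle();
}

static int __init my_init(void)
{
	INIT_WORK(&my_work, my_work_fn);
	INIT_LIST_HEAD(&my_a_dev_info.batch_reactor);
	my_a_dev_info.capacity = 32;	/* batch size, in pages */
	my_a_dev_info.react = my_react;

	return aerator_startup(&my_a_dev_info);
}
module_init(my_init);

static void __exit my_exit(void)
{
	aerator_shutdown();
}
module_exit(my_exit);

MODULE_LICENSE("GPL");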