This is proof-of-concept code for balancing node zone occupancy, where the
imbalance may be caused by memory hotplug.

Writing a page count to the new /proc/sys/vm/balance_node_occupancy_pages
sysctl queues a worker that isolates anonymous pages from ZONE_NORMAL of
node 0 and migrates them into free pages isolated from node 0's
ZONE_MOVABLE.

Signed-off-by: Charan Teja Reddy <charante@xxxxxxxxxxxxxx>
---
 include/linux/migrate.h |   8 +-
 include/linux/mm.h      |   3 +
 include/linux/mmzone.h  |   2 +
 kernel/sysctl.c         |  11 ++
 mm/compaction.c         |   4 +-
 mm/memory_hotplug.c     | 265 ++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 290 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4594838..b7dc259 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -53,6 +53,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
 				  struct page *newpage, struct page *page);
 extern int migrate_page_move_mapping(struct address_space *mapping,
 		struct page *newpage, struct page *page, int extra_count);
+extern void split_map_pages(struct list_head *list);
+extern unsigned long release_freepages(struct list_head *freelist);
 #else
 
 static inline void putback_movable_pages(struct list_head *l) {}
@@ -81,7 +83,11 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 {
 	return -ENOSYS;
 }
-
+static inline void split_map_pages(struct list_head *list) { }
+static inline unsigned long release_freepages(struct list_head *freelist)
+{
+	return 0;
+}
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_COMPACTION
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8..1014139 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2465,6 +2465,9 @@ extern int watermark_boost_factor;
 extern int watermark_scale_factor;
 extern bool arch_has_descending_max_zone_pfns(void);
 
+/* memory_hotplug.c */
+extern int balance_node_occupancy_pages;
+
 /* nommu.c */
 extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316..ce417c3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -977,6 +977,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
 int numa_zonelist_order_handler(struct ctl_table *, int,
 		void *, size_t *, loff_t *);
+extern int sysctl_balance_node_occupancy_handler(struct ctl_table *tbl,
+		int write, void *buf, size_t *len, loff_t *pos);
 extern int percpu_pagelist_fraction;
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN	16
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c9fbdd8..4b95a90 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3140,6 +3140,17 @@ static struct ctl_table vm_table[] = {
 		.extra2		= SYSCTL_ONE,
 	},
 #endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+	{
+		.procname	= "balance_node_occupancy_pages",
+		.data		= &balance_node_occupancy_pages,
+		.maxlen		= sizeof(balance_node_occupancy_pages),
+		.mode		= 0200,
+		.proc_handler	= sysctl_balance_node_occupancy_handler,
+		.extra1		= SYSCTL_ZERO,
+	},
+
+#endif
 	{ }
 };
 
diff --git a/mm/compaction.c b/mm/compaction.c
index 190ccda..da3c015 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -68,7 +68,7 @@ static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
 #define COMPACTION_HPAGE_ORDER	(PMD_SHIFT - PAGE_SHIFT)
 #endif
 
-static unsigned long release_freepages(struct list_head *freelist)
+unsigned long release_freepages(struct list_head *freelist)
 {
 	struct page *page, *next;
 	unsigned long high_pfn = 0;
@@ -84,7 +84,7 @@ static unsigned long release_freepages(struct list_head *freelist)
 	return high_pfn;
 }
 
-static void split_map_pages(struct list_head *list)
+void split_map_pages(struct list_head *list)
 {
 	unsigned int i, order, nr_pages;
 	struct page *page, *next;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f9d57b9..2780c91 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -97,6 +97,271 @@ void mem_hotplug_done(void)
 
 u64 max_mem_size = U64_MAX;
 
+int balance_node_occupancy_pages;
+static atomic_t target_migrate_pages = ATOMIC_INIT(0);
+
+struct movable_zone_fill_control {
+	struct list_head freepages;
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	unsigned long nr_migrate_pages;
+	unsigned long nr_free_pages;
+	unsigned long limit;
+	int target;
+	struct zone *zone;
+};
+
+static void fill_movable_zone_fn(struct work_struct *work);
+static DECLARE_WORK(fill_movable_zone_work, fill_movable_zone_fn);
+static DEFINE_MUTEX(page_migrate_lock);
+
+static inline void reset_page_order(struct page *page)
+{
+	__ClearPageBuddy(page);
+	set_page_private(page, 0);
+}
+
+static int isolate_free_page(struct page *page, unsigned int order)
+{
+	struct zone *zone;
+
+	zone = page_zone(page);
+	list_del(&page->lru);
+	zone->free_area[order].nr_free--;
+	reset_page_order(page);
+
+	return 1UL << order;
+}
+
+static void isolate_free_pages(struct movable_zone_fill_control *fc)
+{
+	struct page *page;
+	unsigned long flags;
+	unsigned int order;
+	unsigned long start_pfn = fc->start_pfn;
+	unsigned long end_pfn = fc->end_pfn;
+
+	spin_lock_irqsave(&fc->zone->lock, flags);
+	for (; start_pfn < end_pfn; start_pfn++) {
+		unsigned long isolated;
+
+		if (!pfn_valid(start_pfn))
+			continue;
+
+		page = pfn_to_page(start_pfn);
+		if (!page)
+			continue;
+
+		if (PageCompound(page)) {
+			struct page *head = compound_head(page);
+			int skip;
+
+			skip = (1 << compound_order(head)) - (page - head);
+			start_pfn += skip - 1;
+			continue;
+		}
+
+		if (!PageBuddy(page))
+			continue;
+
+		order = page_private(page);
+		isolated = isolate_free_page(page, order);
+		set_page_private(page, order);
+		list_add_tail(&page->lru, &fc->freepages);
+		fc->nr_free_pages += isolated;
+		__mod_zone_page_state(fc->zone, NR_FREE_PAGES, -isolated);
+		start_pfn += isolated - 1;
+
+		/*
+		 * Make sure that the zone->lock is not held for long by
+		 * returning once we have SWAP_CLUSTER_MAX pages in the
+		 * free list for migration.
+		 */
+		if (fc->nr_free_pages >= SWAP_CLUSTER_MAX)
+			break;
+	}
+	fc->start_pfn = start_pfn + 1;
+	spin_unlock_irqrestore(&fc->zone->lock, flags);
+
+	split_map_pages(&fc->freepages);
+}
+
+static struct page *movable_page_alloc(struct page *page, unsigned long data)
+{
+	struct movable_zone_fill_control *fc;
+	struct page *freepage;
+
+	fc = (struct movable_zone_fill_control *)data;
+	if (list_empty(&fc->freepages)) {
+		isolate_free_pages(fc);
+		if (list_empty(&fc->freepages))
+			return NULL;
+	}
+
+	freepage = list_entry(fc->freepages.next, struct page, lru);
+	list_del(&freepage->lru);
+	fc->nr_free_pages--;
+
+	return freepage;
+}
+
+static void movable_page_free(struct page *page, unsigned long data)
+{
+	struct movable_zone_fill_control *fc;
+
+	fc = (struct movable_zone_fill_control *)data;
+	list_add(&page->lru, &fc->freepages);
+	fc->nr_free_pages++;
+}
+
+static unsigned long get_anon_movable_pages(
+			struct movable_zone_fill_control *fc,
+			unsigned long start_pfn,
+			unsigned long end_pfn, struct list_head *list)
+{
+	int found = 0, pfn, ret;
+	int limit = min_t(int, fc->target, (int)pageblock_nr_pages);
+
+	fc->nr_migrate_pages = 0;
+	for (pfn = start_pfn; pfn < end_pfn && found < limit; ++pfn) {
+		struct page *page;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (PageCompound(page)) {
+			struct page *head = compound_head(page);
+			int skip;
+
+			skip = (1 << compound_order(head)) - (page - head);
+			pfn += skip - 1;
+			continue;
+		}
+
+		if (PageBuddy(page)) {
+			unsigned long freepage_order;
+
+			freepage_order = READ_ONCE(page_private(page));
+			if (freepage_order > 0 && freepage_order < MAX_ORDER)
+				pfn += (1 << page_private(page)) - 1;
+			continue;
+		}
+
+		if (!PageLRU(page) || !PageAnon(page))
+			continue;
+
+		if (!get_page_unless_zero(page))
+			continue;
+
+		found++;
+		ret = isolate_lru_page(page);
+		if (!ret) {
+			list_add_tail(&page->lru, list);
+			inc_node_page_state(page, NR_ISOLATED_ANON +
+						page_is_file_lru(page));
+			++fc->nr_migrate_pages;
+		}
+
+		put_page(page);
+	}
+
+	return pfn;
+}
+
+static void prepare_fc(struct movable_zone_fill_control *fc)
+{
+	struct zone *zone;
+
+	zone = &(NODE_DATA(0)->node_zones[ZONE_MOVABLE]);
+	fc->zone = zone;
+	fc->start_pfn = zone->zone_start_pfn;
+	fc->end_pfn = zone_end_pfn(zone);
+	fc->limit = atomic64_read(&zone->managed_pages);
+	INIT_LIST_HEAD(&fc->freepages);
+}
+
+#define MIGRATE_TIMEOUT_SEC	(20)
+static void fill_movable_zone_fn(struct work_struct *work)
+{
+	unsigned long start_pfn, end_pfn;
+	unsigned long movable_highmark;
+	struct zone *normal_zone = &(NODE_DATA(0)->node_zones[ZONE_NORMAL]);
+	struct zone *movable_zone = &(NODE_DATA(0)->node_zones[ZONE_MOVABLE]);
+	LIST_HEAD(source);
+	int ret, free;
+	struct movable_zone_fill_control fc = { {0} };
+	unsigned long timeout = MIGRATE_TIMEOUT_SEC * HZ, expire;
+
+	start_pfn = normal_zone->zone_start_pfn;
+	end_pfn = zone_end_pfn(normal_zone);
+	movable_highmark = high_wmark_pages(movable_zone);
+
+	lru_add_drain_all();
+	drain_all_pages(normal_zone);
+	if (!mutex_trylock(&page_migrate_lock))
+		return;
+	prepare_fc(&fc);
+	if (!fc.limit)
+		goto out;
+	expire = jiffies + timeout;
+restart:
+	fc.target = atomic_xchg(&target_migrate_pages, 0);
+	if (!fc.target)
+		goto out;
+repeat:
+	cond_resched();
+	if (time_after(jiffies, expire))
+		goto out;
+	free = zone_page_state(movable_zone, NR_FREE_PAGES);
+	if (free - fc.target <= movable_highmark)
+		fc.target = free - movable_highmark;
+	if (fc.target <= 0)
+		goto out;
+
+	start_pfn = get_anon_movable_pages(&fc, start_pfn,
+					end_pfn, &source);
+	if (list_empty(&source) && start_pfn < end_pfn)
+		goto repeat;
+
+	ret = migrate_pages(&source, movable_page_alloc, movable_page_free,
+				(unsigned long) &fc,
+				MIGRATE_ASYNC, MR_MEMORY_HOTPLUG);
+	if (ret)
+		putback_movable_pages(&source);
+
+	fc.target -= fc.nr_migrate_pages;
+	if (ret == -ENOMEM || start_pfn >= end_pfn)
+		goto out;
+	else if (fc.target <= 0)
+		goto restart;
+
+	goto repeat;
+out:
+	mutex_unlock(&page_migrate_lock);
+	if (fc.nr_free_pages > 0)
+		release_freepages(&fc.freepages);
+}
+
+int sysctl_balance_node_occupancy_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc)
+		return rc;
+
+	if (write) {
+		atomic_add(balance_node_occupancy_pages, &target_migrate_pages);
+
+		if (!work_pending(&fill_movable_zone_work))
+			queue_work(system_unbound_wq, &fill_movable_zone_work);
+	}
+
+	return 0;
+}
+
 /* add this memory to iomem resource */
 static struct resource *register_memory_resource(u64 start, u64 size,
 						 const char *resource_name)
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of the Code Aurora Forum, hosted by The Linux Foundation
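
For anyone trying the PoC, here is a minimal, hypothetical userspace sketch
(not part of the patch) of how the new write-only sysctl could be exercised.
The proc path follows from the vm_table entry above; the page count is an
arbitrary illustration, not a recommended value.

/* Hypothetical userspace sketch -- not part of the patch. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* Arbitrary example: request that 16384 pages be migrated. */
	const long nr_pages = 16384;
	FILE *f = fopen("/proc/sys/vm/balance_node_occupancy_pages", "w");

	if (!f) {
		perror("fopen");
		return EXIT_FAILURE;
	}

	/*
	 * The handler adds the written count to target_migrate_pages and,
	 * if no work is pending, queues fill_movable_zone_work on
	 * system_unbound_wq.
	 */
	if (fprintf(f, "%ld\n", nr_pages) < 0) {
		perror("fprintf");
		fclose(f);
		return EXIT_FAILURE;
	}

	if (fclose(f) != 0) {
		perror("fclose");
		return EXIT_FAILURE;
	}

	return EXIT_SUCCESS;
}

From a root shell the equivalent is simply:
echo 16384 > /proc/sys/vm/balance_node_occupancy_pages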