These fields are written by memory hotplug under mem_hotplug_lock but
read without any lock. The reader code appears to be robust against the
values being stale or "from the future", but we also need to account
for:

1. Load/store tearing. According to Linus[1], this really happens, even
   when everything is aligned as you would hope.

2. Invented loads[2]. The compiler can spill a value and later re-read
   it from the original location, assuming it has not changed in the
   meantime.

Note that we don't need READ_ONCE() in paths that hold mem_hotplug_lock
for write, but we still need WRITE_ONCE() there to prevent store
tearing.

[1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@xxxxxxxxxxxxxx/T/#u
    As discovered via the original big-bad article[2]
[2] https://lwn.net/Articles/793253/

Signed-off-by: Brendan Jackman <jackmanb@xxxxxxxxxx>
---

An illustrative userspace sketch of these hazards (not for the commit
log) follows below the diff.

 include/linux/mmzone.h | 14 ++++++++++----
 mm/compaction.c        |  2 +-
 mm/memory_hotplug.c    | 20 ++++++++++++--------
 mm/mm_init.c           |  2 +-
 mm/page_alloc.c        |  2 +-
 mm/show_mem.c          |  8 ++++----
 mm/vmstat.c            |  4 ++--
 7 files changed, 31 insertions(+), 21 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 194ef7fed9d6..bdb3be76d10c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
 #endif
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline unsigned long zone_end_pfn(const struct zone *zone)
 {
-	return zone->zone_start_pfn + zone->spanned_pages;
+	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
 {
 	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
@@ -1033,9 +1035,10 @@ static inline bool zone_is_initialized(struct zone *zone)
 	return zone->initialized;
 }
 
+/* This is unstable unless you hold mem_hotplug_lock. */
 static inline bool zone_is_empty(struct zone *zone)
 {
-	return zone->spanned_pages == 0;
+	return READ_ONCE(zone->spanned_pages) == 0;
 }
 
 #ifndef BUILD_VDSO32_64
@@ -1485,10 +1488,13 @@ static inline bool managed_zone(struct zone *zone)
 	return zone_managed_pages(zone);
 }
 
-/* Returns true if a zone has memory */
+/*
+ * Returns true if a zone has memory.
+ * This is unstable unless you hold mem_hotplug_lock.
+ */
 static inline bool populated_zone(struct zone *zone)
 {
-	return zone->present_pages;
+	return READ_ONCE(zone->present_pages);
 }
 
 #ifdef CONFIG_NUMA
diff --git a/mm/compaction.c b/mm/compaction.c
index e731d45befc7..b8066d1fdcf5 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -2239,7 +2239,7 @@ static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
 {
 	unsigned long score;
 
-	score = zone->present_pages * fragmentation_score_zone(zone);
+	score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
 	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
 }
 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 431b1f6753c0..71b5e3d314a2 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -463,6 +463,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 	int nid = zone_to_nid(zone);
 
 	if (zone->zone_start_pfn == start_pfn) {
+		unsigned long old_end_pfn = zone_end_pfn(zone);
+
 		/*
 		 * If the section is smallest section in the zone, it need
 		 * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -470,13 +472,13 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		 * for shrinking zone.
 		 */
 		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
-						zone_end_pfn(zone));
+						old_end_pfn);
 		if (pfn) {
-			zone->spanned_pages = zone_end_pfn(zone) - pfn;
+			WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
 			zone->zone_start_pfn = pfn;
 		} else {
 			zone->zone_start_pfn = 0;
-			zone->spanned_pages = 0;
+			WRITE_ONCE(zone->spanned_pages, 0);
 		}
 	} else if (zone_end_pfn(zone) == end_pfn) {
 		/*
@@ -488,10 +490,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
 		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
 					       start_pfn);
 		if (pfn)
-			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+			WRITE_ONCE(zone->spanned_pages,
+				   pfn - zone->zone_start_pfn + 1);
 		else {
 			zone->zone_start_pfn = 0;
-			zone->spanned_pages = 0;
+			WRITE_ONCE(zone->spanned_pages, 0);
 		}
 	}
 }
@@ -710,7 +713,8 @@ static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
 	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
 		zone->zone_start_pfn = start_pfn;
 
-	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
+	WRITE_ONCE(zone->spanned_pages,
+		   max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
 }
 
 static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
@@ -795,7 +799,7 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
 					    struct zone *zone)
 {
 	if (zone_idx(zone) == ZONE_MOVABLE) {
-		stats->movable_pages += zone->present_pages;
+		stats->movable_pages += READ_ONCE(zone->present_pages);
 	} else {
 		stats->kernel_early_pages += zone->present_early_pages;
 #ifdef CONFIG_CMA
@@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
 	 */
 	if (early_section(__pfn_to_section(page_to_pfn(page))))
 		zone->present_early_pages += nr_pages;
-	zone->present_pages += nr_pages;
+	WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);
 	zone->zone_pgdat->node_present_pages += nr_pages;
 
 	if (group && movable)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index c725618aeb58..ec66f2eadb95 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1540,7 +1540,7 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
 	for (z = 0; z < MAX_NR_ZONES; z++) {
 		struct zone *zone = pgdat->node_zones + z;
 
-		zone->present_pages = 0;
+		WRITE_ONCE(zone->present_pages, 0);
 		zone_init_internals(zone, z, nid, 0);
 	}
 }
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 5116a2b9ea6e..1eb9000ec7d7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5728,7 +5728,7 @@ __meminit void zone_pcp_init(struct zone *zone)
 
 	if (populated_zone(zone))
 		pr_debug("  %s zone: %lu pages, LIFO batch:%u\n", zone->name,
-			 zone->present_pages, zone_batchsize(zone));
+			 READ_ONCE(zone->present_pages), zone_batchsize(zone));
 }
 
 void adjust_managed_page_count(struct page *page, long count)
diff --git a/mm/show_mem.c b/mm/show_mem.c
index bdb439551eef..667680a6107b 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
 			K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
 			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
 			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
-			K(zone->present_pages),
+			K(READ_ONCE(zone->present_pages)),
 			K(zone_managed_pages(zone)),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_BOUNCE)),
@@ -407,11 +407,11 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
 
 	for_each_populated_zone(zone) {
 
-		total += zone->present_pages;
-		reserved += zone->present_pages - zone_managed_pages(zone);
+		total += READ_ONCE(zone->present_pages);
+		reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);
 
 		if (is_highmem(zone))
-			highmem += zone->present_pages;
+			highmem += READ_ONCE(zone->present_pages);
 	}
 
 	printk("%lu pages RAM\n", total);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8507c497218b..5a9c4b5768e5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1708,8 +1708,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),
-		   zone->spanned_pages,
-		   zone->present_pages,
+		   READ_ONCE(zone->spanned_pages),
+		   READ_ONCE(zone->present_pages),
 		   zone_managed_pages(zone),
 		   zone_cma_pages(zone));
 
-- 
2.45.0.rc1.225.g2a3ae87e7f-goog
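
P.S. (illustration only, not part of the patch): in case the ONCE
accessors are unfamiliar, here is a minimal userspace sketch of the
hazard described in the commit message. The toy_zone struct, the
simplified READ_ONCE()/WRITE_ONCE() definitions and the helper names
are invented for this example; they are not the kernel's definitions
(the kernel's real macros are more elaborate). Builds with gcc, since
it relies on typeof.

/*
 * Illustration only, not kernel code: the lockless reader takes a
 * single, untorn snapshot of spanned_pages via READ_ONCE(), while the
 * writer (which in the kernel would hold mem_hotplug_lock for write)
 * may read the field plainly but still stores it with WRITE_ONCE() so
 * the store cannot be torn.
 */
#include <stdio.h>

/* Simplified stand-ins for the kernel's ONCE accessors (volatile access). */
#define READ_ONCE(x)		(*(const volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

/* Invented toy structure mirroring the two fields the patch annotates. */
struct toy_zone {
	unsigned long zone_start_pfn;
	unsigned long spanned_pages;	/* written by the "hotplug" side */
};

/* Lockless reader, analogous to zone_end_pfn(): one untorn load. */
static unsigned long toy_zone_end_pfn(const struct toy_zone *zone)
{
	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
}

/*
 * Writer side: the plain read is fine under the writer lock, but the
 * store still goes through WRITE_ONCE() to prevent store tearing.
 */
static void toy_zone_grow(struct toy_zone *zone, unsigned long nr_pages)
{
	WRITE_ONCE(zone->spanned_pages, zone->spanned_pages + nr_pages);
}

int main(void)
{
	struct toy_zone zone = { .zone_start_pfn = 4096, .spanned_pages = 512 };

	toy_zone_grow(&zone, 256);
	printf("end pfn = %lu\n", toy_zone_end_pfn(&zone));
	return 0;
}

The same split shows up throughout the diff: reader paths
(zone_end_pfn(), populated_zone(), the vmstat and show_mem dumps) gain
READ_ONCE(), while the hotplug writers keep plain reads but switch
their stores to WRITE_ONCE().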