Hi Brendan,

On Tue, May 21, 2024 at 8:57 PM Brendan Jackman <jackmanb@xxxxxxxxxx> wrote:
>
> These fields are written by memory hotplug under mem_hotplug_lock but
> read without any lock. It seems like reader code is robust against the
> value being stale or "from the future", but we also need to account
> for:
>
> 1. Load/store tearing (according to Linus[1], this really happens,
>    even when everything is aligned as you would hope).
>
> 2. Invented loads[2] - the compiler can spill and re-read these fields
>    ([2] calls this "invented loads") and assume that they have not
>    changed.
>
> Note we don't need READ_ONCE in paths that have the mem_hotplug_lock
> for write, but we still need WRITE_ONCE to prevent store-tearing.
>
> [1] https://lore.kernel.org/all/CAHk-=wj2t+GK+DGQ7Xy6U7zMf72e7Jkxn4_-kGyfH3WFEoH+YQ@xxxxxxxxxxxxxx/T/#u
>     As discovered via the original big-bad article[2]
> [2] https://lwn.net/Articles/793253/
>
> Signed-off-by: Brendan Jackman <jackmanb@xxxxxxxxxx>
> ---
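
Just to check that I'm reading the "invented loads" point correctly: is
the reader-side pattern being guarded against roughly the one below?
(This is only a hypothetical sketch, not code from the patch; the helper
name is made up, only READ_ONCE() and the zone fields are real.)

static bool zone_pfn_in_span_example(struct zone *zone, unsigned long pfn)
{
	/*
	 * Take one untorn snapshot. With a plain read the compiler may
	 * re-load zone->spanned_pages for each of the two uses below,
	 * so the emptiness check and the bounds check could see
	 * different values, and either load could also be torn.
	 */
	unsigned long spanned = READ_ONCE(zone->spanned_pages);

	if (!spanned)
		return false;
	return pfn >= zone->zone_start_pfn &&
	       pfn < zone->zone_start_pfn + spanned;
}

(I'm mostly asking to make sure the READ_ONCE() in the helpers is meant
to cover exactly this kind of re-read.)
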
>  include/linux/mmzone.h | 14 ++++++++++----
>  mm/compaction.c        |  2 +-
>  mm/memory_hotplug.c    | 20 ++++++++++++--------
>  mm/mm_init.c           |  2 +-
>  mm/page_alloc.c        |  2 +-
>  mm/show_mem.c          |  8 ++++----
>  mm/vmstat.c            |  4 ++--
>  7 files changed, 31 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 194ef7fed9d6..bdb3be76d10c 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1018,11 +1018,13 @@ static inline unsigned long zone_cma_pages(struct zone *zone)
>  #endif
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline unsigned long zone_end_pfn(const struct zone *zone)
>  {
> -	return zone->zone_start_pfn + zone->spanned_pages;
> +	return zone->zone_start_pfn + READ_ONCE(zone->spanned_pages);
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
>  {
>  	return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
> @@ -1033,9 +1035,10 @@ static inline bool zone_is_initialized(struct zone *zone)
>  	return zone->initialized;
>  }
>
> +/* This is unstable unless you hold mem_hotplug_lock. */
>  static inline bool zone_is_empty(struct zone *zone)
>  {
> -	return zone->spanned_pages == 0;
> +	return READ_ONCE(zone->spanned_pages) == 0;
>  }
>
>  #ifndef BUILD_VDSO32_64
> @@ -1485,10 +1488,13 @@ static inline bool managed_zone(struct zone *zone)
>  	return zone_managed_pages(zone);
>  }
>
> -/* Returns true if a zone has memory */
> +/*
> + * Returns true if a zone has memory.
> + * This is unstable unless you old mem_hotplug_lock.
> + */
>  static inline bool populated_zone(struct zone *zone)
>  {
> -	return zone->present_pages;
> +	return READ_ONCE(zone->present_pages);
>  }
>
>  #ifdef CONFIG_NUMA
> diff --git a/mm/compaction.c b/mm/compaction.c
> index e731d45befc7..b8066d1fdcf5 100644
> --- a/mm/compaction.c
> +++ b/mm/compaction.c
> @@ -2239,7 +2239,7 @@ static unsigned int fragmentation_score_zone_weighted(struct zone *zone)
>  {
>  	unsigned long score;
>
> -	score = zone->present_pages * fragmentation_score_zone(zone);
> +	score = READ_ONCE(zone->present_pages) * fragmentation_score_zone(zone);
>  	return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
>  }
>
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 431b1f6753c0..71b5e3d314a2 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -463,6 +463,8 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  	int nid = zone_to_nid(zone);
>
>  	if (zone->zone_start_pfn == start_pfn) {
> +		unsigned long old_end_pfn = zone_end_pfn(zone);
> +
>  		/*
>  		 * If the section is smallest section in the zone, it need
>  		 * shrink zone->zone_start_pfn and zone->zone_spanned_pages.
> @@ -470,13 +472,13 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  		 * for shrinking zone.
>  		 */
>  		pfn = find_smallest_section_pfn(nid, zone, end_pfn,
> -						zone_end_pfn(zone));
> +						old_end_pfn);
>  		if (pfn) {
> -			zone->spanned_pages = zone_end_pfn(zone) - pfn;
> +			WRITE_ONCE(zone->spanned_pages, old_end_pfn - pfn);
>  			zone->zone_start_pfn = pfn;
>  		} else {
>  			zone->zone_start_pfn = 0;
> -			zone->spanned_pages = 0;
> +			WRITE_ONCE(zone->spanned_pages, 0);
>  		}
>  	} else if (zone_end_pfn(zone) == end_pfn) {
>  		/*
> @@ -488,10 +490,11 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
>  		pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
>  					       start_pfn);
>  		if (pfn)
> -			zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
> +			WRITE_ONCE(zone->spanned_pages,
> +				   pfn - zone->zone_start_pfn + 1);
>  		else {
>  			zone->zone_start_pfn = 0;
> -			zone->spanned_pages = 0;
> +			WRITE_ONCE(zone->spanned_pages, 0);
>  		}
>  	}
>  }
> @@ -710,7 +713,8 @@ static void __meminit resize_zone_range(struct zone *zone, unsigned long start_p
>  	if (zone_is_empty(zone) || start_pfn < zone->zone_start_pfn)
>  		zone->zone_start_pfn = start_pfn;
>
> -	zone->spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn;
> +	WRITE_ONCE(zone->spanned_pages,
> +		   max(start_pfn + nr_pages, old_end_pfn) - zone->zone_start_pfn);
>  }
>
>  static void __meminit resize_pgdat_range(struct pglist_data *pgdat, unsigned long start_pfn,
> @@ -795,7 +799,7 @@ static void auto_movable_stats_account_zone(struct auto_movable_stats *stats,
>  					    struct zone *zone)
>  {
>  	if (zone_idx(zone) == ZONE_MOVABLE) {
> -		stats->movable_pages += zone->present_pages;
> +		stats->movable_pages += READ_ONCE(zone->present_pages);
>  	} else {
>  		stats->kernel_early_pages += zone->present_early_pages;
>  #ifdef CONFIG_CMA
> @@ -1077,7 +1081,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
>  	 */
>  	if (early_section(__pfn_to_section(page_to_pfn(page))))
>  		zone->present_early_pages += nr_pages;
> -	zone->present_pages += nr_pages;
> +	WRITE_ONCE(zone->present_pages, zone->present_pages + nr_pages);

I'm not sure the WRITE_ONCE() wrapper prevents load tearing on the read
of 'zone->present_pages' on the right-hand side here, but it's probably
just me overthinking it :)
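
To spell out what I mean, I read that statement as roughly the
following (hypothetical expansion, 'tmp' is made up and not code from
the patch):

	/* The value expression is evaluated with a plain load... */
	unsigned long tmp = zone->present_pages;

	/* ...and only the store itself is volatile / untorn. */
	WRITE_ONCE(zone->present_pages, tmp + nr_pages);

So WRITE_ONCE() keeps the store from being torn, but the read on the
right-hand side stays a plain load. I guess that is fine as long as
this path runs with mem_hotplug_lock held for write, as the commit
message says writers do, so nothing else should be updating the field
under us.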
Thanks,
Lance

>  	zone->zone_pgdat->node_present_pages += nr_pages;
>
>  	if (group && movable)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index c725618aeb58..ec66f2eadb95 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -1540,7 +1540,7 @@ void __ref free_area_init_core_hotplug(struct pglist_data *pgdat)
>  	for (z = 0; z < MAX_NR_ZONES; z++) {
>  		struct zone *zone = pgdat->node_zones + z;
>
> -		zone->present_pages = 0;
> +		WRITE_ONCE(zone->present_pages, 0);
>  		zone_init_internals(zone, z, nid, 0);
>  	}
>  }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 5116a2b9ea6e..1eb9000ec7d7 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5728,7 +5728,7 @@ __meminit void zone_pcp_init(struct zone *zone)
>
>  	if (populated_zone(zone))
>  		pr_debug(" %s zone: %lu pages, LIFO batch:%u\n", zone->name,
> -			 zone->present_pages, zone_batchsize(zone));
> +			 READ_ONCE(zone->present_pages), zone_batchsize(zone));
>  }
>
>  void adjust_managed_page_count(struct page *page, long count)
> diff --git a/mm/show_mem.c b/mm/show_mem.c
> index bdb439551eef..667680a6107b 100644
> --- a/mm/show_mem.c
> +++ b/mm/show_mem.c
> @@ -337,7 +337,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
>  			K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
>  			K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
>  			K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
> -			K(zone->present_pages),
> +			K(READ_ONCE(zone->present_pages)),
>  			K(zone_managed_pages(zone)),
>  			K(zone_page_state(zone, NR_MLOCK)),
>  			K(zone_page_state(zone, NR_BOUNCE)),
> @@ -407,11 +407,11 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
>
>  	for_each_populated_zone(zone) {
>
> -		total += zone->present_pages;
> -		reserved += zone->present_pages - zone_managed_pages(zone);
> +		total += READ_ONCE(zone->present_pages);
> +		reserved += READ_ONCE(zone->present_pages) - zone_managed_pages(zone);
>
>  		if (is_highmem(zone))
> -			highmem += zone->present_pages;
> +			highmem += READ_ONCE(zone->present_pages);
>  	}
>
>  	printk("%lu pages RAM\n", total);
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index 8507c497218b..5a9c4b5768e5 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1708,8 +1708,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
>  		   min_wmark_pages(zone),
>  		   low_wmark_pages(zone),
>  		   high_wmark_pages(zone),
> -		   zone->spanned_pages,
> -		   zone->present_pages,
> +		   READ_ONCE(zone->spanned_pages),
> +		   READ_ONCE(zone->present_pages),
>  		   zone_managed_pages(zone),
>  		   zone_cma_pages(zone));
>
>
> --
> 2.45.0.rc1.225.g2a3ae87e7f-goog
>
>