The per-CPU counters are modified with the non-atomic modifier.
Consistency is ensured by disabling interrupts for the update. This
breaks on PREEMPT_RT because some sections additionally acquire a
spinlock_t lock (which becomes a sleeping lock on PREEMPT_RT and must
not be acquired with disabled interrupts).

Another problem is that mem_cgroup_swapout() expects to be invoked with
disabled interrupts because its caller has to acquire a spinlock_t (the
i_pages lock) which is acquired with disabled interrupts. Since
spinlock_t never disables interrupts on PREEMPT_RT, the interrupts are
never disabled at this point.

The code is never called from in_irq() context on PREEMPT_RT, therefore
disabling preemption during the update is sufficient on PREEMPT_RT. The
sections with disabled preemption must exclude memcg_check_events() so
that spinlock_t locks can still be acquired (for instance in
eventfd_signal()).

Don't disable interrupts during updates of the per-CPU variables;
instead use shorter sections which disable preemption.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
 mm/memcontrol.c | 74 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 60 insertions(+), 14 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2ed5f2a0879d3..d2687d5ed544b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -671,8 +671,14 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 	if (mem_cgroup_disabled())
 		return;
 
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 	memcg_rstat_updated(memcg);
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
 }
 
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -699,6 +705,9 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 	memcg = pn->memcg;
 
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+
 	/* Update memcg */
 	__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 
@@ -706,6 +715,9 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 	__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
 
 	memcg_rstat_updated(memcg);
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
 }
 
 /**
@@ -788,8 +800,13 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 	if (mem_cgroup_disabled())
 		return;
 
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+
 	__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
 	memcg_rstat_updated(memcg);
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
 }
 
 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -810,6 +827,9 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 int nr_pages)
 {
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
+
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
 		__count_memcg_events(memcg, PGPGIN, 1);
@@ -819,12 +839,19 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	}
 
 	__this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
 }
 
 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 				       enum mem_cgroup_events_target target)
 {
 	unsigned long val, next;
+	bool ret = false;
+
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 
 	val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 	next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
@@ -841,9 +868,11 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 			break;
 		}
 		__this_cpu_write(memcg->vmstats_percpu->targets[target], next);
-		return true;
+		ret = true;
 	}
-	return false;
+	if (IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable();
+	return ret;
 }
 
 /*
@@ -5645,12 +5674,14 @@ static int mem_cgroup_move_account(struct page *page,
 	ret = 0;
 	nid = folio_nid(folio);
 
-	local_irq_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_disable();
 	mem_cgroup_charge_statistics(to, nr_pages);
 	memcg_check_events(to, nid);
 	mem_cgroup_charge_statistics(from, -nr_pages);
 	memcg_check_events(from, nid);
-	local_irq_enable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_enable();
 out_unlock:
 	folio_unlock(folio);
 out:
@@ -6670,10 +6701,12 @@ static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
 	css_get(&memcg->css);
 	commit_charge(folio, memcg);
 
-	local_irq_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_disable();
 	mem_cgroup_charge_statistics(memcg, nr_pages);
 	memcg_check_events(memcg, folio_nid(folio));
-	local_irq_enable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_enable();
 out:
 	return ret;
 }
@@ -6785,11 +6818,20 @@ static void uncharge_batch(const struct uncharge_gather *ug)
 		memcg_oom_recover(ug->memcg);
 	}
 
-	local_irq_save(flags);
-	__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
-	__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
-	memcg_check_events(ug->memcg, ug->nid);
-	local_irq_restore(flags);
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		local_irq_save(flags);
+		__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
+		__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
+		memcg_check_events(ug->memcg, ug->nid);
+		local_irq_restore(flags);
+	} else {
+		preempt_disable();
+		__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
+		__this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
+		preempt_enable();
+
+		memcg_check_events(ug->memcg, ug->nid);
+	}
 
 	/* drop reference from uncharge_folio */
 	css_put(&ug->memcg->css);
@@ -6930,10 +6972,12 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
 	css_get(&memcg->css);
 	commit_charge(new, memcg);
 
-	local_irq_save(flags);
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_save(flags);
 	mem_cgroup_charge_statistics(memcg, nr_pages);
 	memcg_check_events(memcg, folio_nid(new));
-	local_irq_restore(flags);
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		local_irq_restore(flags);
 }
 
 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
@@ -7157,8 +7201,10 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 	 * i_pages lock which is taken with interrupts-off. It is
 	 * important here to have the interrupts disabled because it is the
 	 * only synchronisation we have for updating the per-CPU variables.
+	 * On PREEMPT_RT interrupts are never disabled and the updates to per-CPU
+	 * variables are synchronised by keeping preemption disabled.
 	 */
-	VM_BUG_ON(!irqs_disabled());
+	VM_BUG_ON(!IS_ENABLED(CONFIG_PREEMPT_RT) && !irqs_disabled());
 	mem_cgroup_charge_statistics(memcg, -nr_entries);
 	memcg_check_events(memcg, page_to_nid(page));
 
-- 
2.34.1
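
For readers following along, the diff applies the same pattern everywhere.
Below is a minimal, illustrative sketch of that pattern; it is not part of
the patch, and the per-CPU counter my_stat and the helper my_stat_add() are
made-up names used only to show the technique:

#include <linux/percpu.h>
#include <linux/preempt.h>

/* Hypothetical per-CPU counter, standing in for the vmstats_percpu fields. */
static DEFINE_PER_CPU(unsigned long, my_stat);

/*
 * On !PREEMPT_RT the caller is expected to have interrupts disabled,
 * which serialises the non-atomic read-modify-write. On PREEMPT_RT
 * interrupts stay enabled, so preemption is disabled around the update
 * instead.
 */
static void my_stat_add(unsigned long val)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		preempt_disable();

	__this_cpu_add(my_stat, val);	/* non-atomic per-CPU update */

	if (IS_ENABLED(CONFIG_PREEMPT_RT))
		preempt_enable();

	/*
	 * Anything that may acquire a spinlock_t (a sleeping lock on
	 * PREEMPT_RT), such as memcg_check_events() -> eventfd_signal(),
	 * must run here, outside the preempt-disabled section.
	 */
}

The split in uncharge_batch() follows the same rule: the preempt-disabled
section covers only the counter updates, and memcg_check_events() runs
afterwards with preemption enabled.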