On Mon, Dec 05, 2022 at 04:50:46PM -0800, Ivan Babrou wrote: > And now I can see plenty of this: > > [ 108.156707][ T5175] socket pressure[2]: 4294673429 > [ 108.157050][ T5175] socket pressure[2]: 4294673429 > [ 108.157301][ T5175] socket pressure[2]: 4294673429 > [ 108.157581][ T5175] socket pressure[2]: 4294673429 > [ 108.157874][ T5175] socket pressure[2]: 4294673429 > [ 108.158254][ T5175] socket pressure[2]: 4294673429 > > I think the first result below is to blame: > > $ rg '.->socket_pressure' mm > mm/memcontrol.c > 5280: memcg->socket_pressure = jiffies; > 7198: memcg->socket_pressure = 0; > 7201: memcg->socket_pressure = 1; > 7211: memcg->socket_pressure = 0; > 7215: memcg->socket_pressure = 1; Hoo boy, that's a silly mistake indeed. Thanks for tracking it down. > While we set socket_pressure to either zero or one in > mem_cgroup_charge_skmem, it is still initialized to jiffies on memcg > creation. Zero seems like a more appropriate starting point. With that > change I see it working as expected with no TCP speed bumps. My > ebpf_exporter program also looks happy and reports zero clamps in my > brief testing. Excellent, now this behavior makes sense. > I also think we should downgrade socket_pressure from "unsigned long" > to "bool", as it only holds zero and one now. Sounds good to me! Attaching the updated patch below. If nobody has any objections, I'll add a proper changelog, Reported-by tags, and a sign-off, and then send it out. 
--- include/linux/memcontrol.h | 8 +++--- include/linux/vmpressure.h | 7 ++--- mm/memcontrol.c | 20 +++++++++---- mm/vmpressure.c | 58 ++++++-------------------------------- mm/vmscan.c | 15 +--------- 5 files changed, 30 insertions(+), 78 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e1644a24009c..ef1c388be5b3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -283,11 +283,11 @@ struct mem_cgroup { atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; - unsigned long socket_pressure; + /* Socket memory allocations have failed */ + bool socket_pressure; /* Legacy tcp memory accounting */ bool tcpmem_active; - int tcpmem_pressure; #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; @@ -1701,10 +1701,10 @@ void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->socket_pressure) return true; do { - if (time_before(jiffies, READ_ONCE(memcg->socket_pressure))) + if (memcg->socket_pressure) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 6a2f51ebbfd3..20d93de37a17 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -11,9 +11,6 @@ #include <linux/eventfd.h> struct vmpressure { - unsigned long scanned; - unsigned long reclaimed; - unsigned long tree_scanned; unsigned long tree_reclaimed; /* The lock is used to keep the scanned/reclaimed above in sync. 
*/ @@ -30,7 +27,7 @@ struct vmpressure { struct mem_cgroup; #ifdef CONFIG_MEMCG -extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, +extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, unsigned long scanned, unsigned long reclaimed); extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio); @@ -44,7 +41,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg, extern void vmpressure_unregister_event(struct mem_cgroup *memcg, struct eventfd_ctx *eventfd); #else -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, +static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, unsigned long scanned, unsigned long reclaimed) {} static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) {} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2d8549ae1b30..0d4b9dbe775a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5277,7 +5277,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void) vmpressure_init(&memcg->vmpressure); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); - memcg->socket_pressure = jiffies; #ifdef CONFIG_MEMCG_KMEM memcg->kmemcg_id = -1; INIT_LIST_HEAD(&memcg->objcg_list); @@ -7195,10 +7194,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, struct page_counter *fail; if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) { - memcg->tcpmem_pressure = 0; + memcg->socket_pressure = false; return true; } - memcg->tcpmem_pressure = 1; + memcg->socket_pressure = true; if (gfp_mask & __GFP_NOFAIL) { page_counter_charge(&memcg->tcpmem, nr_pages); return true; @@ -7206,12 +7205,21 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages, return false; } - if (try_charge(memcg, gfp_mask, nr_pages) == 0) { - mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); - return true; + if (try_charge(memcg, gfp_mask & ~__GFP_NOFAIL, nr_pages) == 0) { + memcg->socket_pressure = 
false; + goto success; + } + memcg->socket_pressure = true; + if (gfp_mask & __GFP_NOFAIL) { + try_charge(memcg, gfp_mask, nr_pages); + goto success; } return false; + +success: + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); + return true; } /** diff --git a/mm/vmpressure.c b/mm/vmpressure.c index b52644771cc4..4cec90711cf4 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -219,7 +219,6 @@ static void vmpressure_work_fn(struct work_struct *work) * vmpressure() - Account memory pressure through scanned/reclaimed ratio * @gfp: reclaimer's gfp mask * @memcg: cgroup memory controller handle - * @tree: legacy subtree mode * @scanned: number of pages scanned * @reclaimed: number of pages reclaimed * @@ -227,16 +226,9 @@ static void vmpressure_work_fn(struct work_struct *work) * "instantaneous" memory pressure (scanned/reclaimed ratio). The raw * pressure index is then further refined and averaged over time. * - * If @tree is set, vmpressure is in traditional userspace reporting - * mode: @memcg is considered the pressure root and userspace is - * notified of the entire subtree's reclaim efficiency. - * - * If @tree is not set, reclaim efficiency is recorded for @memcg, and - * only in-kernel users are notified. - * * This function does not return any value. 
*/ -void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, +void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, unsigned long scanned, unsigned long reclaimed) { struct vmpressure *vmpr; @@ -271,46 +263,14 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree, if (!scanned) return; - if (tree) { - spin_lock(&vmpr->sr_lock); - scanned = vmpr->tree_scanned += scanned; - vmpr->tree_reclaimed += reclaimed; - spin_unlock(&vmpr->sr_lock); - - if (scanned < vmpressure_win) - return; - schedule_work(&vmpr->work); - } else { - enum vmpressure_levels level; - - /* For now, no users for root-level efficiency */ - if (!memcg || mem_cgroup_is_root(memcg)) - return; - - spin_lock(&vmpr->sr_lock); - scanned = vmpr->scanned += scanned; - reclaimed = vmpr->reclaimed += reclaimed; - if (scanned < vmpressure_win) { - spin_unlock(&vmpr->sr_lock); - return; - } - vmpr->scanned = vmpr->reclaimed = 0; - spin_unlock(&vmpr->sr_lock); + spin_lock(&vmpr->sr_lock); + scanned = vmpr->tree_scanned += scanned; + vmpr->tree_reclaimed += reclaimed; + spin_unlock(&vmpr->sr_lock); - level = vmpressure_calc_level(scanned, reclaimed); - - if (level > VMPRESSURE_LOW) { - /* - * Let the socket buffer allocator know that - * we are having trouble reclaiming LRU pages. - * - * For hysteresis keep the pressure state - * asserted for a second in which subsequent - * pressure events can occur. - */ - WRITE_ONCE(memcg->socket_pressure, jiffies + HZ); - } - } + if (scanned < vmpressure_win) + return; + schedule_work(&vmpr->work); } /** @@ -340,7 +300,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * to the vmpressure() basically means that we signal 'critical' * level. 
*/ - vmpressure(gfp, memcg, true, vmpressure_win, 0); + vmpressure(gfp, memcg, vmpressure_win, 0); } #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) diff --git a/mm/vmscan.c b/mm/vmscan.c index 04d8b88e5216..d348366d58d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6035,8 +6035,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) memcg = mem_cgroup_iter(target_memcg, NULL, NULL); do { struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); - unsigned long reclaimed; - unsigned long scanned; /* * This loop can become CPU-bound when target memcgs @@ -6068,20 +6066,9 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) memcg_memory_event(memcg, MEMCG_LOW); } - reclaimed = sc->nr_reclaimed; - scanned = sc->nr_scanned; - shrink_lruvec(lruvec, sc); - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); - - /* Record the group's reclaim efficiency */ - if (!sc->proactive) - vmpressure(sc->gfp_mask, memcg, false, - sc->nr_scanned - scanned, - sc->nr_reclaimed - reclaimed); - } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); } @@ -6111,7 +6098,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) /* Record the subtree's reclaim efficiency */ if (!sc->proactive) - vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true, + vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); -- 2.38.1