Re: [PATCH 01/11] cgroup: move rstat pointers into struct of their own

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2/20/25 8:53 AM, Yosry Ahmed wrote:
On Mon, Feb 17, 2025 at 07:14:38PM -0800, JP Kobryn wrote:
The rstat infrastructure makes use of pointers for list management.
These pointers only exist as fields in the cgroup struct, so moving them
into their own struct will allow them to be used elsewhere. The base
stat entities are included with them for now.

Signed-off-by: JP Kobryn <inwardvessel@xxxxxxxxx>
---
  include/linux/cgroup-defs.h                   | 90 +-----------------
  include/linux/cgroup_rstat.h                  | 92 +++++++++++++++++++
  kernel/cgroup/cgroup.c                        |  3 +-
  kernel/cgroup/rstat.c                         | 27 +++---
  .../selftests/bpf/progs/btf_type_tag_percpu.c |  4 +-
  5 files changed, 112 insertions(+), 104 deletions(-)
  create mode 100644 include/linux/cgroup_rstat.h

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 1b20d2d8ef7c..6b6cc027fe70 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -17,7 +17,7 @@
  #include <linux/refcount.h>
  #include <linux/percpu-refcount.h>
  #include <linux/percpu-rwsem.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/cgroup_rstat.h>
  #include <linux/workqueue.h>
  #include <linux/bpf-cgroup-defs.h>
  #include <linux/psi_types.h>
@@ -321,78 +321,6 @@ struct css_set {
  	struct rcu_head rcu_head;
  };
-struct cgroup_base_stat {
-	struct task_cputime cputime;
-
-#ifdef CONFIG_SCHED_CORE
-	u64 forceidle_sum;
-#endif
-	u64 ntime;
-};
-
-/*
- * rstat - cgroup scalable recursive statistics.  Accounting is done
- * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
- * hierarchy on reads.
- *
- * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
- * linked into the updated tree.  On the following read, propagation only
- * considers and consumes the updated tree.  This makes reading O(the
- * number of descendants which have been active since last read) instead of
- * O(the total number of descendants).
- *
- * This is important because there can be a lot of (draining) cgroups which
- * aren't active and stat may be read frequently.  The combination can
- * become very expensive.  By propagating selectively, increasing reading
- * frequency decreases the cost of each read.
- *
- * This struct hosts both the fields which implement the above -
- * updated_children and updated_next - and the fields which track basic
- * resource statistics on top of it - bsync, bstat and last_bstat.
- */
-struct cgroup_rstat_cpu {
-	/*
-	 * ->bsync protects ->bstat.  These are the only fields which get
-	 * updated in the hot path.
-	 */
-	struct u64_stats_sync bsync;
-	struct cgroup_base_stat bstat;
-
-	/*
-	 * Snapshots at the last reading.  These are used to calculate the
-	 * deltas to propagate to the global counters.
-	 */
-	struct cgroup_base_stat last_bstat;
-
-	/*
-	 * This field is used to record the cumulative per-cpu time of
-	 * the cgroup and its descendants. Currently it can be read via
-	 * eBPF/drgn etc, and we are still trying to determine how to
-	 * expose it in the cgroupfs interface.
-	 */
-	struct cgroup_base_stat subtree_bstat;
-
-	/*
-	 * Snapshots at the last reading. These are used to calculate the
-	 * deltas to propagate to the per-cpu subtree_bstat.
-	 */
-	struct cgroup_base_stat last_subtree_bstat;
-
-	/*
-	 * Child cgroups with stat updates on this cpu since the last read
-	 * are linked on the parent's ->updated_children through
-	 * ->updated_next.
-	 *
-	 * In addition to being more compact, singly-linked list pointing
-	 * to the cgroup makes it unnecessary for each per-cpu struct to
-	 * point back to the associated cgroup.
-	 *
-	 * Protected by per-cpu cgroup_rstat_cpu_lock.
-	 */
-	struct cgroup *updated_children;	/* terminated by self cgroup */
-	struct cgroup *updated_next;		/* NULL iff not on the list */
-};
-
  struct cgroup_freezer_state {
  	/* Should the cgroup and its descendants be frozen. */
  	bool freeze;
@@ -517,23 +445,9 @@ struct cgroup {
  	struct cgroup *old_dom_cgrp;		/* used while enabling threaded */
/* per-cpu recursive resource statistics */
-	struct cgroup_rstat_cpu __percpu *rstat_cpu;
+	struct cgroup_rstat rstat;
  	struct list_head rstat_css_list;
- /*
-	 * Add padding to separate the read mostly rstat_cpu and
-	 * rstat_css_list into a different cacheline from the following
-	 * rstat_flush_next and *bstat fields which can have frequent updates.
-	 */
-	CACHELINE_PADDING(_pad_);
-
-	/*
-	 * A singly-linked list of cgroup structures to be rstat flushed.
-	 * This is a scratch field to be used exclusively by
-	 * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock.
-	 */
-	struct cgroup	*rstat_flush_next;
-
  	/* cgroup basic resource statistics */
  	struct cgroup_base_stat last_bstat;
  	struct cgroup_base_stat bstat;
diff --git a/include/linux/cgroup_rstat.h b/include/linux/cgroup_rstat.h
new file mode 100644
index 000000000000..f95474d6f8ab
--- /dev/null
+++ b/include/linux/cgroup_rstat.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_RSTAT_H
+#define _LINUX_RSTAT_H
+
+#include <linux/u64_stats_sync.h>
+
+struct cgroup_rstat_cpu;

Why do we need the forward declaration instead of just defining struct
cgroup_rstat_cpu first? Also, why do we need a new header for these
definitions rather than just adding struct cgroup_rstat to
cgroup-defs.h?

The new header was added so the cgroup_rstat type can be used in bpf
cgroup-defs.h. As for the forward declaration, this was done so that
updated_next and updated_children fields of the cgroup_rstat_cpu can
change type from cgroup to cgroup_rstat.

Regardless, based on the direction we are moving with bpf sharing the
"self" tree, this new header will NOT be needed in v2.


+
+/*
+ * rstat - cgroup scalable recursive statistics.  Accounting is done
+ * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
+ * hierarchy on reads.
+ *
+ * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
+ * linked into the updated tree.  On the following read, propagation only
+ * considers and consumes the updated tree.  This makes reading O(the
+ * number of descendants which have been active since last read) instead of
+ * O(the total number of descendants).
+ *
+ * This is important because there can be a lot of (draining) cgroups which
+ * aren't active and stat may be read frequently.  The combination can
+ * become very expensive.  By propagating selectively, increasing reading
+ * frequency decreases the cost of each read.
+ *
+ * This struct hosts both the fields which implement the above -
+ * updated_children and updated_next - and the fields which track basic
+ * resource statistics on top of it - bsync, bstat and last_bstat.
+ */
+struct cgroup_rstat {
+	struct cgroup_rstat_cpu __percpu *rstat_cpu;
+
+	/*
+	 * Add padding to separate the read mostly rstat_cpu and
+	 * rstat_css_list into a different cacheline from the following
+	 * rstat_flush_next and containing struct fields which can have
+	 * frequent updates.
+	 */
+	CACHELINE_PADDING(_pad_);
+	struct cgroup *rstat_flush_next;
+};
+
+struct cgroup_base_stat {
+	struct task_cputime cputime;
+
+#ifdef CONFIG_SCHED_CORE
+	u64 forceidle_sum;
+#endif
+	u64 ntime;
+};
+
+struct cgroup_rstat_cpu {
+	/*
+	 * Child cgroups with stat updates on this cpu since the last read
+	 * are linked on the parent's ->updated_children through
+	 * ->updated_next.
+	 *
+	 * In addition to being more compact, singly-linked list pointing
+	 * to the cgroup makes it unnecessary for each per-cpu struct to
+	 * point back to the associated cgroup.
+	 */
+	struct cgroup *updated_children;	/* terminated by self */
+	struct cgroup *updated_next;		/* NULL if not on the list */
+
+	/*
+	 * ->bsync protects ->bstat.  These are the only fields which get
+	 * updated in the hot path.
+	 */
+	struct u64_stats_sync bsync;
+	struct cgroup_base_stat bstat;
+
+	/*
+	 * Snapshots at the last reading.  These are used to calculate the
+	 * deltas to propagate to the global counters.
+	 */
+	struct cgroup_base_stat last_bstat;
+
+	/*
+	 * This field is used to record the cumulative per-cpu time of
+	 * the cgroup and its descendants. Currently it can be read via
+	 * eBPF/drgn etc, and we are still trying to determine how to
+	 * expose it in the cgroupfs interface.
+	 */
+	struct cgroup_base_stat subtree_bstat;
+
+	/*
+	 * Snapshots at the last reading. These are used to calculate the
+	 * deltas to propagate to the per-cpu subtree_bstat.
+	 */
+	struct cgroup_base_stat last_subtree_bstat;
+};
+
+#endif	/* _LINUX_RSTAT_H */





[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux