On 2023/1/20 11:46, Jiaqi Yan wrote:
Today kernel provides following memory error info to userspace, but each
has its own disadvantage
* HardwareCorrupted in /proc/meminfo: number of bytes poisoned in total,
not per NUMA node stats though
* ras:memory_failure_event: only available after explicitly enabled
* /dev/mcelog provides many useful info about the MCEs, but
doesn't capture how memory_failure recovered memory MCEs
* kernel logs: userspace needs to process log text
Exposes per NUMA node memory error stats as sysfs entries:
/sys/devices/system/node/node${X}/memory_failure/total
/sys/devices/system/node/node${X}/memory_failure/recovered
/sys/devices/system/node/node${X}/memory_failure/ignored
/sys/devices/system/node/node${X}/memory_failure/failed
/sys/devices/system/node/node${X}/memory_failure/delayed
These counters describe how many raw pages are poisoned and after the
attempted recoveries by the kernel, their resolutions: how many are
recovered, ignored, failed, or delayed respectively. The following
math holds for the statistics:
* total = recovered + ignored + failed + delayed
Acked-by: David Rientjes <rientjes@xxxxxxxxxx>
Signed-off-by: Jiaqi Yan <jiaqiyan@xxxxxxxxxx>
---
drivers/base/node.c | 3 +++
include/linux/mm.h | 5 +++++
include/linux/mmzone.h | 28 ++++++++++++++++++++++++++++
mm/memory-failure.c | 35 +++++++++++++++++++++++++++++++++++
4 files changed, 71 insertions(+)
diff --git a/drivers/base/node.c b/drivers/base/node.c
index faf3597a96da..b46db17124f3 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -586,6 +586,9 @@ static const struct attribute_group *node_dev_groups[] = {
&node_dev_group,
#ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP
&arch_node_dev_group,
+#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ &memory_failure_attr_group,
#endif
NULL
};
diff --git a/include/linux/mm.h b/include/linux/mm.h
index f3f196e4d66d..888576884eb9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3521,6 +3521,11 @@ enum mf_action_page_type {
MF_MSG_UNKNOWN,
};
+/*
+ * Sysfs entries for memory failure handling statistics.
+ */
+extern const struct attribute_group memory_failure_attr_group;
+
This should move under CONFIG_MEMORY_FAILURE
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
extern void clear_huge_page(struct page *page,
unsigned long addr_hint,
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cd28a100d9e4..2c537b31fa7b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1110,6 +1110,31 @@ struct deferred_split {
};
#endif
+#ifdef CONFIG_MEMORY_FAILURE
+/*
+ * Per NUMA node memory failure handling statistics.
+ */
+struct memory_failure_stats {
+ /*
+ * Number of raw pages poisoned.
+ * Cases not accounted: memory outside kernel control, offline page,
+ * arch-specific memory_failure (SGX), hwpoison_filter() filtered
+ * error events, and unpoison actions from hwpoison_unpoison.
+ */
+ unsigned long total;
+ /*
+ * Recovery results of poisoned raw pages handled by memory_failure,
+ * in sync with mf_result.
+ * total = ignored + failed + delayed + recovered.
+ * total * PAGE_SIZE * #nodes = /proc/meminfo/HardwareCorrupted.
+ */
+ unsigned long ignored;
+ unsigned long failed;
+ unsigned long delayed;
+ unsigned long recovered;
+};
+#endif
+
/*
* On NUMA machines, each NUMA node would have a pg_data_t to describe
* it's memory layout. On UMA machines there is a single pglist_data which
@@ -1253,6 +1278,9 @@ typedef struct pglist_data {
#ifdef CONFIG_NUMA
struct memory_tier __rcu *memtier;
#endif
+#ifdef CONFIG_MEMORY_FAILURE
+ struct memory_failure_stats mf_stats;
+#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c77a9e37e27e..c628f1db3a4d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -87,6 +87,41 @@ inline void num_poisoned_pages_sub(unsigned long pfn, long i)
memblk_nr_poison_sub(pfn, i);
}
+/**
+ * MF_ATTR_RO - Create sysfs entry for each memory failure statistics.
+ * @_name: name of the file in the per NUMA sysfs directory.
+ */
+#define MF_ATTR_RO(_name) \
+static ssize_t _name##_show(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct memory_failure_stats *mf_stats = \
+ &NODE_DATA(dev->id)->mf_stats; \
+ return sprintf(buf, "%lu\n", mf_stats->_name); \
+} \
+static DEVICE_ATTR_RO(_name)
+
+MF_ATTR_RO(total);
+MF_ATTR_RO(ignored);
+MF_ATTR_RO(failed);
+MF_ATTR_RO(delayed);
+MF_ATTR_RO(recovered);
+
+static struct attribute *memory_failure_attr[] = {
+ &dev_attr_total.attr,
+ &dev_attr_ignored.attr,
+ &dev_attr_failed.attr,
+ &dev_attr_delayed.attr,
+ &dev_attr_recovered.attr,
+ NULL,
+};
+
+const struct attribute_group memory_failure_attr_group = {
+ .name = "memory_failure",
+ .attrs = memory_failure_attr,
+};
+
/*
* Return values:
* 1: the page is dissolved (if needed) and taken off from buddy,