Hi Fan, Thanks for the comments. Sorry for the delay. >-----Original Message----- >From: fan <nifan.cxl@xxxxxxxxx> >Sent: 19 July 2024 19:43 >To: Shiju Jose <shiju.jose@xxxxxxxxxx> >Cc: linux-edac@xxxxxxxxxxxxxxx; linux-cxl@xxxxxxxxxxxxxxx; linux- >acpi@xxxxxxxxxxxxxxx; linux-mm@xxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx; >bp@xxxxxxxxx; tony.luck@xxxxxxxxx; rafael@xxxxxxxxxx; lenb@xxxxxxxxxx; >mchehab@xxxxxxxxxx; dan.j.williams@xxxxxxxxx; dave@xxxxxxxxxxxx; Jonathan >Cameron <jonathan.cameron@xxxxxxxxxx>; dave.jiang@xxxxxxxxx; >alison.schofield@xxxxxxxxx; vishal.l.verma@xxxxxxxxx; ira.weiny@xxxxxxxxx; >david@xxxxxxxxxx; Vilas.Sridharan@xxxxxxx; leo.duran@xxxxxxx; >Yazen.Ghannam@xxxxxxx; rientjes@xxxxxxxxxx; jiaqiyan@xxxxxxxxxx; >Jon.Grimm@xxxxxxx; dave.hansen@xxxxxxxxxxxxxxx; >naoya.horiguchi@xxxxxxx; james.morse@xxxxxxx; jthoughton@xxxxxxxxxx; >somasundaram.a@xxxxxxx; erdemaktas@xxxxxxxxxx; pgonda@xxxxxxxxxx; >duenwen@xxxxxxxxxx; mike.malvestuto@xxxxxxxxx; gthelen@xxxxxxxxxx; >wschwartz@xxxxxxxxxxxxxxxxxxx; dferguson@xxxxxxxxxxxxxxxxxxx; >wbs@xxxxxxxxxxxxxxxxxxxxxx; nifan.cxl@xxxxxxxxx; tanxiaofei ><tanxiaofei@xxxxxxxxxx>; Zengtao (B) <prime.zeng@xxxxxxxxxxxxx>; Roberto >Sassu <roberto.sassu@xxxxxxxxxx>; kangkang.shen@xxxxxxxxxxxxx; >wanghuiqiang <wanghuiqiang@xxxxxxxxxx>; Linuxarm ><linuxarm@xxxxxxxxxx> >Subject: Re: [RFC PATCH v9 08/11] cxl/memscrub: Add CXL memory device ECS >control feature > >On Tue, Jul 16, 2024 at 04:03:32PM +0100, shiju.jose@xxxxxxxxxx wrote: >> From: Shiju Jose <shiju.jose@xxxxxxxxxx> >> >> CXL spec 3.1 section 8.2.9.9.11.2 describes the DDR5 Error Check Scrub >> (ECS) control feature. >> The Error Check Scrub (ECS) is a feature defined in JEDEC DDR5 SDRAM >> Specification (JESD79-5) and allows the DRAM to internally read, >> correct single-bit errors, and write back corrected data bits to the >> DRAM array while providing transparency to error counts. >> >> The ECS control allows the requester to change the log entry type, the >> ECS threshold count provided that the request is within the definition >> specified in DDR5 mode registers, change mode between codeword mode >> and row count mode, and reset the ECS counter. >> >> Register with EDAC RAS control feature driver, which gets the ECS attr >> descriptors from the EDAC ECS and expose sysfs ECS control attributes >> to the userspace. >> For example ECS control for the memory media FRU 0 in CXL mem0 device >> is in /sys/bus/edac/devices/cxl_mem0/ecs_fru0/ >> >> Note: The documentation can be added if necessary. >> >> Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx> >> --- > >Some lines are too long. And some other comments inline. Wil fix. > >> drivers/cxl/core/memscrub.c | 429 >> ++++++++++++++++++++++++++++++++++++ >> 1 file changed, 429 insertions(+) >> >> diff --git a/drivers/cxl/core/memscrub.c b/drivers/cxl/core/memscrub.c >> index 430f85b01f6c..9be230ea989a 100644 >> --- a/drivers/cxl/core/memscrub.c >> +++ b/drivers/cxl/core/memscrub.c >> @@ -351,13 +351,411 @@ static const struct edac_scrub_ops >cxl_ps_scrub_ops = { >> .cycle_in_hours_range = cxl_patrol_scrub_read_scrub_cycle_hrs_range, >> }; >> >> +/* CXL DDR5 ECS control definitions */ >> +#define CXL_MEMDEV_ECS_GET_FEAT_VERSION 0x01 >> +#define CXL_MEMDEV_ECS_SET_FEAT_VERSION 0x01 >> + >> +static const uuid_t cxl_ecs_uuid = >> + UUID_INIT(0xe5b13f22, 0x2328, 0x4a14, 0xb8, 0xba, 0xb9, 0x69, 0x1e, >\ >> + 0x89, 0x33, 0x86); >> + >> +struct cxl_ecs_context { >> + u16 num_media_frus; >> + u16 get_feat_size; >> + u16 set_feat_size; >> + struct cxl_memdev *cxlmd; >> +}; >> + >> +/** >> + * struct cxl_ecs_params - CXL memory DDR5 ECS parameter data structure. >> + * @log_entry_type: ECS log entry type, per DRAM or per memory media >FRU. >> + * @threshold: ECS threshold count per GB of memory cells. >> + * @mode: codeword/row count mode >> + * 0 : ECS counts rows with errors >> + * 1 : ECS counts codeword with errors >> + * @reset_counter: [IN] reset ECC counter to default value. >> + */ >> +struct cxl_ecs_params { >> + u8 log_entry_type; >> + u16 threshold; >> + u8 mode; > >An enum is defined below, why not directly use enum type here? Will do. > >> + bool reset_counter; >> +}; >> + >> +enum { >> + CXL_ECS_PARAM_LOG_ENTRY_TYPE, >> + CXL_ECS_PARAM_THRESHOLD, >> + CXL_ECS_PARAM_MODE, >> + CXL_ECS_PARAM_RESET_COUNTER, >> +}; >> + >> +#define CXL_ECS_LOG_ENTRY_TYPE_MASK GENMASK(1, 0) >> +#define CXL_ECS_REALTIME_REPORT_CAP_MASK BIT(0) >> +#define CXL_ECS_THRESHOLD_COUNT_MASK GENMASK(2, 0) >> +#define CXL_ECS_MODE_MASK BIT(3) >> +#define CXL_ECS_RESET_COUNTER_MASK BIT(4) >> + >> +static const u16 ecs_supp_threshold[] = { 0, 0, 0, 256, 1024, 4096 }; >> + >> +enum { >> + ECS_LOG_ENTRY_TYPE_DRAM = 0x0, >> + ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU = 0x1, }; >> + >> +enum { >> + ECS_THRESHOLD_256 = 3, >> + ECS_THRESHOLD_1024 = 4, >> + ECS_THRESHOLD_4096 = 5, >> +}; >> + >> +enum { >> + ECS_MODE_COUNTS_ROWS = 0, >> + ECS_MODE_COUNTS_CODEWORDS = 1, >> +}; >> + >> +struct cxl_ecs_rd_attrs { >> + u8 ecs_log_cap; >> + u8 ecs_cap; >> + __le16 ecs_config; >> + u8 ecs_flags; >> +} __packed; >> + >> +struct cxl_ecs_wr_attrs { >> + u8 ecs_log_cap; >> + __le16 ecs_config; >> +} __packed; >> + >> +/* CXL DDR5 ECS control functions */ >> +static int cxl_mem_ecs_get_attrs(struct device *dev, void *drv_data, int >fru_id, >> + struct cxl_ecs_params *params) >> +{ >> + struct cxl_ecs_context *cxl_ecs_ctx = drv_data; >> + struct cxl_memdev *cxlmd = cxl_ecs_ctx->cxlmd; >> + struct cxl_dev_state *cxlds = cxlmd->cxlds; >> + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); >> + size_t rd_data_size; >> + u8 threshold_index; >> + size_t data_size; >> + >> + rd_data_size = cxl_ecs_ctx->get_feat_size; >> + >> + struct cxl_ecs_rd_attrs *rd_attrs __free(kfree) = >> + kmalloc(rd_data_size, GFP_KERNEL); >> + if (!rd_attrs) >> + return -ENOMEM; >> + >> + params->log_entry_type = 0; >> + params->threshold = 0; >> + params->mode = 0; >> + data_size = cxl_get_feature(mds, cxl_ecs_uuid, rd_attrs, >> + rd_data_size, >CXL_GET_FEAT_SEL_CURRENT_VALUE); >> + if (!data_size) >> + return -EIO; >> + >> + params->log_entry_type = >FIELD_GET(CXL_ECS_LOG_ENTRY_TYPE_MASK, >> + rd_attrs[fru_id].ecs_log_cap); >> + threshold_index = FIELD_GET(CXL_ECS_THRESHOLD_COUNT_MASK, >> + rd_attrs[fru_id].ecs_config); >> + params->threshold = ecs_supp_threshold[threshold_index]; >> + params->mode = FIELD_GET(CXL_ECS_MODE_MASK, >> + rd_attrs[fru_id].ecs_config); >> + return 0; >> +} >> + >> +static int cxl_mem_ecs_set_attrs(struct device *dev, void *drv_data, int >fru_id, >> + struct cxl_ecs_params *params, u8 >param_type) { >> + struct cxl_ecs_context *cxl_ecs_ctx = drv_data; >> + struct cxl_memdev *cxlmd = cxl_ecs_ctx->cxlmd; >> + struct cxl_dev_state *cxlds = cxlmd->cxlds; >> + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); >> + size_t rd_data_size, wr_data_size; >> + u16 num_media_frus, count; >> + size_t data_size; >> + int ret; >> + >> + num_media_frus = cxl_ecs_ctx->num_media_frus; >> + rd_data_size = cxl_ecs_ctx->get_feat_size; >> + wr_data_size = cxl_ecs_ctx->set_feat_size; >> + struct cxl_ecs_rd_attrs *rd_attrs __free(kfree) = >> + kmalloc(rd_data_size, GFP_KERNEL); >> + if (!rd_attrs) >> + return -ENOMEM; >> + >> + data_size = cxl_get_feature(mds, cxl_ecs_uuid, rd_attrs, >> + rd_data_size, >CXL_GET_FEAT_SEL_CURRENT_VALUE); >> + if (!data_size) >> + return -EIO; >> + struct cxl_ecs_wr_attrs *wr_attrs __free(kfree) = >> + kmalloc(wr_data_size, GFP_KERNEL); >> + if (!wr_attrs) >> + return -ENOMEM; >> + >> + /* Fill writable attributes from the current attributes read for all the >media FRUs */ >> + for (count = 0; count < num_media_frus; count++) { >> + wr_attrs[count].ecs_log_cap = rd_attrs[count].ecs_log_cap; >> + wr_attrs[count].ecs_config = rd_attrs[count].ecs_config; >> + } >> + >> + /* Fill attribute to be set for the media FRU */ >> + switch (param_type) { >> + case CXL_ECS_PARAM_LOG_ENTRY_TYPE: >> + if (params->log_entry_type != ECS_LOG_ENTRY_TYPE_DRAM >&& >> + params->log_entry_type != >ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU) { >> + dev_err(dev, >> + "Invalid CXL ECS scrub log entry type(%d) to >set\n", >> + params->log_entry_type); >> + dev_err(dev, >> + "Log Entry Type 0: per DRAM 1: per Memory >Media FRU\n"); >> + return -EINVAL; >> + } >> + wr_attrs[fru_id].ecs_log_cap = >FIELD_PREP(CXL_ECS_LOG_ENTRY_TYPE_MASK, >> + params- >>log_entry_type); >> + break; >> + case CXL_ECS_PARAM_THRESHOLD: >> + wr_attrs[fru_id].ecs_config &= >~CXL_ECS_THRESHOLD_COUNT_MASK; >> + switch (params->threshold) { >> + case 256: >> + wr_attrs[fru_id].ecs_config |= FIELD_PREP( >> + CXL_ECS_THRESHOLD_COUNT_MASK, >> + ECS_THRESHOLD_256); >> + break; >> + case 1024: >> + wr_attrs[fru_id].ecs_config |= FIELD_PREP( >> + > CXL_ECS_THRESHOLD_COUNT_MASK, >> + ECS_THRESHOLD_1024); >> + break; >> + case 4096: >> + wr_attrs[fru_id].ecs_config |= FIELD_PREP( >> + > CXL_ECS_THRESHOLD_COUNT_MASK, >> + ECS_THRESHOLD_4096); >> + break; >> + default: >> + dev_err(dev, >> + "Invalid CXL ECS scrub threshold count(%d) to >set\n", >> + params->threshold); >> + dev_err(dev, >> + "Supported scrub threshold count: >256,1024,4096\n"); >> + return -EINVAL; >> + } >> + break; >> + case CXL_ECS_PARAM_MODE: >> + if (params->mode != ECS_MODE_COUNTS_ROWS && >> + params->mode != ECS_MODE_COUNTS_CODEWORDS) { >> + dev_err(dev, >> + "Invalid CXL ECS scrub mode(%d) to set\n", >> + params->mode); >> + dev_err(dev, >> + "Mode 0: ECS counts rows with errors" >> + " 1: ECS counts codewords with errors\n"); >> + return -EINVAL; >> + } >> + wr_attrs[fru_id].ecs_config &= ~CXL_ECS_MODE_MASK; >> + wr_attrs[fru_id].ecs_config |= >FIELD_PREP(CXL_ECS_MODE_MASK, >> + params->mode); >> + break; >> + case CXL_ECS_PARAM_RESET_COUNTER: >> + wr_attrs[fru_id].ecs_config &= >~CXL_ECS_RESET_COUNTER_MASK; >> + wr_attrs[fru_id].ecs_config |= >FIELD_PREP(CXL_ECS_RESET_COUNTER_MASK, >> + params- >>reset_counter); >> + break; >> + default: >> + dev_err(dev, "Invalid CXL ECS parameter to set\n"); >> + return -EINVAL; >> + } >> + >> + ret = cxl_set_feature(mds, cxl_ecs_uuid, >CXL_MEMDEV_ECS_SET_FEAT_VERSION, >> + wr_attrs, wr_data_size, >> + >CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET); >> + if (ret) { >> + dev_err(dev, "CXL ECS set feature failed ret=%d\n", ret); >> + return ret; >> + } >> + >> + return 0; >> +} >> + >> +static int cxl_ecs_get_log_entry_type(struct device *dev, void >> +*drv_data, int fru_id, u64 *val) { >> + struct cxl_ecs_params params; >> + int ret; >> + >> + ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, ¶ms); >> + if (ret) >> + return ret; >> + >> + *val = params.log_entry_type; >> + >> + return 0; >> +} >> + >> +static int cxl_ecs_set_log_entry_type(struct device *dev, void >> +*drv_data, int fru_id, u64 val) { >> + struct cxl_ecs_params params = { >> + .log_entry_type = val, >> + }; >> + >> + return cxl_mem_ecs_set_attrs(dev, drv_data, fru_id, ¶ms, >> +CXL_ECS_PARAM_LOG_ENTRY_TYPE); } >> + >> +static int cxl_ecs_get_log_entry_type_per_dram(struct device *dev, void >*drv_data, >> + int fru_id, u64 *val) > >I may have missed something. We have cxl_ecs_get_log_entry_type, and what is >cxl_ecs_get_log_entry_type_per_memory_media and >cxl_ecs_get_log_entry_type_per_dram for? Reason for adding these readonly attributes to avoid user need to check the spec to interpret the value set or the supported options for ECS log type.