Hi Fan, >-----Original Message----- >From: fan <nifan.cxl@xxxxxxxxx> >Sent: 16 February 2024 17:59 >To: Shiju Jose <shiju.jose@xxxxxxxxxx> >Cc: linux-cxl@xxxxxxxxxxxxxxx; linux-acpi@xxxxxxxxxxxxxxx; linux- >mm@xxxxxxxxx; dave@xxxxxxxxxxxx; Jonathan Cameron ><jonathan.cameron@xxxxxxxxxx>; dave.jiang@xxxxxxxxx; >alison.schofield@xxxxxxxxx; vishal.l.verma@xxxxxxxxx; ira.weiny@xxxxxxxxx; >dan.j.williams@xxxxxxxxx; linux-edac@xxxxxxxxxxxxxxx; linux- >kernel@xxxxxxxxxxxxxxx; david@xxxxxxxxxx; Vilas.Sridharan@xxxxxxx; >leo.duran@xxxxxxx; Yazen.Ghannam@xxxxxxx; rientjes@xxxxxxxxxx; >jiaqiyan@xxxxxxxxxx; tony.luck@xxxxxxxxx; Jon.Grimm@xxxxxxx; >dave.hansen@xxxxxxxxxxxxxxx; rafael@xxxxxxxxxx; lenb@xxxxxxxxxx; >naoya.horiguchi@xxxxxxx; james.morse@xxxxxxx; jthoughton@xxxxxxxxxx; >somasundaram.a@xxxxxxx; erdemaktas@xxxxxxxxxx; pgonda@xxxxxxxxxx; >duenwen@xxxxxxxxxx; mike.malvestuto@xxxxxxxxx; gthelen@xxxxxxxxxx; >wschwartz@xxxxxxxxxxxxxxxxxxx; dferguson@xxxxxxxxxxxxxxxxxxx; >tanxiaofei <tanxiaofei@xxxxxxxxxx>; Zengtao (B) <prime.zeng@xxxxxxxxxxxxx>; >kangkang.shen@xxxxxxxxxxxxx; wanghuiqiang <wanghuiqiang@xxxxxxxxxx>; >Linuxarm <linuxarm@xxxxxxxxxx> >Subject: Re: [RFC PATCH v5 05/12] cxl/memscrub: Add CXL device ECS control >feature > >On Thu, Jan 11, 2024 at 09:17:34PM +0800, shiju.jose@xxxxxxxxxx wrote: >> From: Shiju Jose <shiju.jose@xxxxxxxxxx> >> >> CXL spec 3.1 section 8.2.9.9.11.2 describes the DDR5 Error Check Scrub >> (ECS) control feature. >> >> The Error Check Scrub (ECS) is a feature defined in JEDEC DDR5 SDRAM >> Specification (JESD79-5) and allows the DRAM to internally read, >> correct single-bit errors, and write back corrected data bits to the >> DRAM array while providing transparency to error counts. The ECS >> control feature allows the request to configure ECS input >> configurations during system boot or at run-time. >> >> The ECS control allows the requester to change the log entry type, the >> ECS threshold count provided that the request is within the definition >> specified in DDR5 mode registers, change mode between codeword mode >> and row count mode, and reset the ECS counter. >> >> Open Question: >> Is cxl_mem_ecs_init() invoked in the right function in cxl/core/region.c? >> >> Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx> >> --- >> drivers/cxl/core/memscrub.c | 303 >+++++++++++++++++++++++++++++++++++- >> drivers/cxl/core/region.c | 1 + >> drivers/cxl/cxlmem.h | 3 + >> 3 files changed, 306 insertions(+), 1 deletion(-) >> >> diff --git a/drivers/cxl/core/memscrub.c b/drivers/cxl/core/memscrub.c >> index e0d482b0bf3a..e7741e2fdbdb 100644 >> --- a/drivers/cxl/core/memscrub.c >> +++ b/drivers/cxl/core/memscrub.c >> @@ -5,7 +5,7 @@ >> * Copyright (c) 2023 HiSilicon Limited. >> * >> * - Provides functions to configure patrol scrub >> - * feature of the CXL memory devices. >> + * and DDR5 ECS features of the CXL memory devices. >> */ >> >> #define pr_fmt(fmt) "CXL_MEM_SCRUB: " fmt >> @@ -264,3 +264,304 @@ int cxl_mem_patrol_scrub_init(struct cxl_memdev >*cxlmd) >> return 0; >> } >> EXPORT_SYMBOL_NS_GPL(cxl_mem_patrol_scrub_init, CXL); >> + >> +/* CXL DDR5 ECS control definitions */ >> +#define CXL_MEMDEV_ECS_GET_FEAT_VERSION 0x01 >> +#define CXL_MEMDEV_ECS_SET_FEAT_VERSION 0x01 >> + >> +static const uuid_t cxl_ecs_uuid = >> + UUID_INIT(0xe5b13f22, 0x2328, 0x4a14, 0xb8, 0xba, 0xb9, 0x69, 0x1e, >\ >> + 0x89, 0x33, 0x86); >> + >> +struct cxl_ecs_context { >> + struct device *dev; >> + u16 nregions; >> + int region_id; >> + u16 get_feat_size; >> + u16 set_feat_size; >> +}; >> + >> +/** >> + * struct cxl_memdev_ecs_params - CXL memory DDR5 ECS parameter data >structure. >> + * @log_entry_type: ECS log entry type, per DRAM or per memory media >FRU. >> + * @threshold: ECS threshold count per GB of memory cells. >> + * @mode: codeword/row count mode >> + * 0 : ECS counts rows with errors >> + * 1 : ECS counts codeword with errors >> + * @reset_counter: [IN] reset ECC counter to default value. >> + */ >> +struct cxl_memdev_ecs_params { >> + u8 log_entry_type; >> + u16 threshold; > >Why need to be u16? It has only 3 bits. Here u16 threshold stores the ECS threshold count 256/1024/4096 from the user and converted to the corresponding 3 bit value in cxl_mem_ecs_set_attrbs() below. Please see spec r3.1, Table 8-210. DDR5 ECS Control Feature Readable Attributes Bits[2:0]: ECS Threshold Count per Gb of Memory Cells: - 011b = 256 (default) - 100b = 1024 - 101b = 4096 > >> + u8 mode; >> + bool reset_counter; >> +}; >> + >> +enum { >> + CXL_MEMDEV_ECS_PARAM_LOG_ENTRY_TYPE = 0, >> + CXL_MEMDEV_ECS_PARAM_THRESHOLD, >> + CXL_MEMDEV_ECS_PARAM_MODE, >> + CXL_MEMDEV_ECS_PARAM_RESET_COUNTER, >> +}; >> + >> +#define CXL_MEMDEV_ECS_LOG_ENTRY_TYPE_MASK GENMASK(1, 0) >> +#define CXL_MEMDEV_ECS_REALTIME_REPORT_CAP_MASK BIT(0) >> +#define CXL_MEMDEV_ECS_THRESHOLD_COUNT_MASK > GENMASK(2, 0) >> +#define CXL_MEMDEV_ECS_MODE_MASK BIT(3) >> +#define CXL_MEMDEV_ECS_RESET_COUNTER_MASK BIT(4) >> + >> +static const u16 ecs_supp_threshold[] = { 0, 0, 0, 256, 1024, 4096 }; >> + >> +enum { >> + ECS_LOG_ENTRY_TYPE_DRAM = 0x0, >> + ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU = 0x1, }; >> + >> +enum { >> + ECS_THRESHOLD_256 = 3, >> + ECS_THRESHOLD_1024 = 4, >> + ECS_THRESHOLD_4096 = 5, >> +}; >> + >> +enum { >> + ECS_MODE_COUNTS_ROWS = 0, >> + ECS_MODE_COUNTS_CODEWORDS = 1, >> +}; >> + >> +struct cxl_memdev_ecs_feat_read_attrbs { >> + u8 ecs_log_cap; >> + u8 ecs_cap; >> + __le16 ecs_config; >> + u8 ecs_flags; >> +} __packed; >> + >> +struct cxl_memdev_ecs_set_feat_pi { >> + struct cxl_mbox_set_feat_in pi; >> + struct cxl_memdev_ecs_feat_wr_attrbs { >> + u8 ecs_log_cap; >> + __le16 ecs_config; >> + } __packed wr_attrbs[]; >> +} __packed; >> + >> +/* CXL DDR5 ECS control functions */ >> +static int cxl_mem_ecs_get_attrbs(struct device *dev, int fru_id, >> + struct cxl_memdev_ecs_params *params) { >> + struct cxl_memdev_ecs_feat_read_attrbs *rd_attrbs __free(kvfree) = >NULL; >> + struct cxl_memdev *cxlmd = to_cxl_memdev(dev->parent); >> + struct cxl_dev_state *cxlds = cxlmd->cxlds; >> + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); >> + struct cxl_mbox_get_feat_in pi = { >> + .uuid = cxl_ecs_uuid, >> + .offset = 0, >> + .selection = CXL_GET_FEAT_SEL_CURRENT_VALUE, >> + }; >> + struct cxl_ecs_context *cxl_ecs_ctx; >> + u8 threshold_index; >> + int ret; >> + >> + if (!mds) >> + return -EFAULT; >> + cxl_ecs_ctx = dev_get_drvdata(dev); >> + >> + pi.count = cxl_ecs_ctx->get_feat_size; >> + rd_attrbs = kvmalloc(pi.count, GFP_KERNEL); >> + if (!rd_attrbs) >> + return -ENOMEM; >> + >> + ret = cxl_get_feature(mds, &pi, rd_attrbs); >> + if (ret) { >> + params->log_entry_type = 0; >> + params->threshold = 0; >> + params->mode = 0; >> + return ret; >> + } >> + params->log_entry_type = >FIELD_GET(CXL_MEMDEV_ECS_LOG_ENTRY_TYPE_MASK, >> + rd_attrbs[fru_id].ecs_log_cap); >> + threshold_index = >FIELD_GET(CXL_MEMDEV_ECS_THRESHOLD_COUNT_MASK, >> + rd_attrbs[fru_id].ecs_config); >> + params->threshold = ecs_supp_threshold[threshold_index]; >> + params->mode = FIELD_GET(CXL_MEMDEV_ECS_MODE_MASK, >> + rd_attrbs[fru_id].ecs_config); >> + >> + return 0; >> +} >> + >> +static int __maybe_unused >> +cxl_mem_ecs_set_attrbs(struct device *dev, int fru_id, >> + struct cxl_memdev_ecs_params *params, u8 param_type) { >> + struct cxl_memdev_ecs_feat_read_attrbs *rd_attrbs __free(kvfree) = >NULL; >> + struct cxl_memdev_ecs_set_feat_pi *set_pi __free(kvfree) = NULL; >> + struct cxl_memdev *cxlmd = to_cxl_memdev(dev->parent); >> + struct cxl_dev_state *cxlds = cxlmd->cxlds; >> + struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds); >> + struct cxl_mbox_get_feat_in pi = { >> + .uuid = cxl_ecs_uuid, >> + .offset = 0, >> + .selection = CXL_GET_FEAT_SEL_CURRENT_VALUE, >> + }; >> + struct cxl_memdev_ecs_feat_wr_attrbs *wr_attrbs; >> + struct cxl_memdev_ecs_params rd_params; >> + struct cxl_ecs_context *cxl_ecs_ctx; >> + u16 nmedia_frus, count; >> + u32 set_pi_size; >> + int ret; >> + >> + if (!mds) >> + return -EFAULT; >> + >> + cxl_ecs_ctx = dev_get_drvdata(dev); >> + nmedia_frus = cxl_ecs_ctx->nregions; >> + >> + rd_attrbs = kvmalloc(cxl_ecs_ctx->get_feat_size, GFP_KERNEL); >> + if (!rd_attrbs) >> + return -ENOMEM; >> + >> + pi.count = cxl_ecs_ctx->get_feat_size; >> + ret = cxl_get_feature(mds, &pi, rd_attrbs); >> + if (ret) >> + return ret; >> + set_pi_size = sizeof(struct cxl_mbox_set_feat_in) + >> + cxl_ecs_ctx->set_feat_size; >> + set_pi = kvmalloc(set_pi_size, GFP_KERNEL); >> + if (!set_pi) >> + return -ENOMEM; >> + >> + set_pi->pi.uuid = cxl_ecs_uuid; >> + set_pi->pi.flags = >CXL_SET_FEAT_FLAG_MOD_VALUE_SAVED_ACROSS_RESET | >> + CXL_SET_FEAT_FLAG_FULL_DATA_TRANSFER; >> + set_pi->pi.offset = 0; >> + set_pi->pi.version = CXL_MEMDEV_ECS_SET_FEAT_VERSION; >> + /* Fill writable attributes from the current attributes read for all the >media FRUs */ >> + wr_attrbs = set_pi->wr_attrbs; >> + for (count = 0; count < nmedia_frus; count++) { >> + wr_attrbs[count].ecs_log_cap = rd_attrbs[count].ecs_log_cap; >> + wr_attrbs[count].ecs_config = rd_attrbs[count].ecs_config; >> + } >> + >> + /* Fill attribute to be set for the media FRU */ >> + switch (param_type) { >> + case CXL_MEMDEV_ECS_PARAM_LOG_ENTRY_TYPE: >> + if (params->log_entry_type != ECS_LOG_ENTRY_TYPE_DRAM >&& >> + params->log_entry_type != >ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU) { >> + dev_err(dev->parent, >> + "Invalid CXL ECS scrub log entry type(%d) to >set\n", >> + params->log_entry_type); >> + dev_err(dev->parent, >> + "Log Entry Type 0: per DRAM 1: per Memory >Media FRU\n"); >> + return -EINVAL; >> + } >> + wr_attrbs[fru_id].ecs_log_cap = >FIELD_PREP(CXL_MEMDEV_ECS_LOG_ENTRY_TYPE_MASK, >> + params- >>log_entry_type); >> + break; >> + case CXL_MEMDEV_ECS_PARAM_THRESHOLD: >> + wr_attrbs[fru_id].ecs_config &= >~CXL_MEMDEV_ECS_THRESHOLD_COUNT_MASK; >> + switch (params->threshold) { >> + case 256: >> + wr_attrbs[fru_id].ecs_config |= FIELD_PREP( >> + > CXL_MEMDEV_ECS_THRESHOLD_COUNT_MASK, >> + ECS_THRESHOLD_256); >> + break; >> + case 1024: >> + wr_attrbs[fru_id].ecs_config |= FIELD_PREP( >> + > CXL_MEMDEV_ECS_THRESHOLD_COUNT_MASK, >> + ECS_THRESHOLD_1024); >> + break; >> + case 4096: >> + wr_attrbs[fru_id].ecs_config |= FIELD_PREP( >> + > CXL_MEMDEV_ECS_THRESHOLD_COUNT_MASK, >> + ECS_THRESHOLD_4096); >> + break; >> + default: >> + dev_err(dev->parent, >> + "Invalid CXL ECS scrub threshold count(%d) to >set\n", >> + params->threshold); >> + dev_err(dev->parent, >> + "Supported scrub threshold count: >256,1024,4096\n"); >> + return -EINVAL; >> + } >> + break; >> + case CXL_MEMDEV_ECS_PARAM_MODE: >> + if (params->mode != ECS_MODE_COUNTS_ROWS && >> + params->mode != ECS_MODE_COUNTS_CODEWORDS) { >> + dev_err(dev->parent, >> + "Invalid CXL ECS scrub mode(%d) to set\n", >> + params->mode); >> + dev_err(dev->parent, >> + "Mode 0: ECS counts rows with errors" >> + " 1: ECS counts codewords with errors\n"); >> + return -EINVAL; >> + } >> + wr_attrbs[fru_id].ecs_config &= >~CXL_MEMDEV_ECS_MODE_MASK; >> + wr_attrbs[fru_id].ecs_config |= >FIELD_PREP(CXL_MEMDEV_ECS_MODE_MASK, >> + params->mode); >> + break; >> + case CXL_MEMDEV_ECS_PARAM_RESET_COUNTER: >> + wr_attrbs[fru_id].ecs_config &= >~CXL_MEMDEV_ECS_RESET_COUNTER_MASK; >> + wr_attrbs[fru_id].ecs_config |= >FIELD_PREP(CXL_MEMDEV_ECS_RESET_COUNTER_MASK, >> + params- >>reset_counter); >> + break; >> + default: >> + dev_err(dev->parent, "Invalid CXL ECS parameter to set\n"); >> + return -EINVAL; >> + } >> + ret = cxl_set_feature(mds, set_pi, set_pi_size); >> + if (ret) { >> + dev_err(dev->parent, "CXL ECS set feature fail ret=%d\n", ret); >> + return ret; >> + } >> + >> + /* Verify attribute is set successfully */ >> + ret = cxl_mem_ecs_get_attrbs(dev, fru_id, &rd_params); >> + if (ret) { >> + dev_err(dev->parent, "Get cxlmemdev ECS params fail >ret=%d\n", ret); >> + return ret; >> + } >> + switch (param_type) { >> + case CXL_MEMDEV_ECS_PARAM_LOG_ENTRY_TYPE: >> + if (rd_params.log_entry_type != params->log_entry_type) >> + return -EFAULT; >> + break; >> + case CXL_MEMDEV_ECS_PARAM_THRESHOLD: >> + if (rd_params.threshold != params->threshold) >> + return -EFAULT; >> + break; >> + case CXL_MEMDEV_ECS_PARAM_MODE: >> + if (rd_params.mode != params->mode) >> + return -EFAULT; >> + break; >> + } >> + >> + return 0; >> +} >> + >> +int cxl_mem_ecs_init(struct cxl_memdev *cxlmd, int region_id) { >> + struct cxl_mbox_supp_feat_entry feat_entry; >> + struct cxl_ecs_context *cxl_ecs_ctx; >> + int nmedia_frus; >> + int ret; >> + >> + ret = cxl_mem_get_supported_feature_entry(cxlmd, &cxl_ecs_uuid, >&feat_entry); >> + if (ret < 0) >> + return ret; >> + >> + if (!(feat_entry.attrb_flags & CXL_FEAT_ENTRY_FLAG_CHANGABLE)) >> + return -ENOTSUPP; >> + nmedia_frus = feat_entry.get_feat_size/ >> + sizeof(struct >cxl_memdev_ecs_feat_read_attrbs); >> + if (nmedia_frus) { >> + cxl_ecs_ctx = devm_kzalloc(&cxlmd->dev, sizeof(*cxl_ecs_ctx), >GFP_KERNEL); >> + if (!cxl_ecs_ctx) >> + return -ENOMEM; >> + >> + cxl_ecs_ctx->nregions = nmedia_frus; >> + cxl_ecs_ctx->get_feat_size = feat_entry.get_feat_size; >> + cxl_ecs_ctx->set_feat_size = feat_entry.set_feat_size; >> + cxl_ecs_ctx->region_id = region_id; >> + } >> + >> + return 0; >> +} >> +EXPORT_SYMBOL_NS_GPL(cxl_mem_ecs_init, CXL); >> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c >> index 3e817a6f94c6..ca71ad403d62 100644 >> --- a/drivers/cxl/core/region.c >> +++ b/drivers/cxl/core/region.c >> @@ -2912,6 +2912,7 @@ int cxl_add_to_region(struct cxl_port *root, struct >cxl_endpoint_decoder *cxled) >> dev_err(&cxlr->dev, "failed to enable, range: %pr\n", >> p->res); >> } >> + cxl_mem_ecs_init(cxlmd, atomic_read(&cxlrd->region_id)); > >Check the return value and process accordingly. Sure. > >Fan > >> >> put_device(region_dev); >> out: >> diff --git a/drivers/cxl/cxlmem.h b/drivers/cxl/cxlmem.h index >> 25c46e72af16..c255063dd795 100644 >> --- a/drivers/cxl/cxlmem.h >> +++ b/drivers/cxl/cxlmem.h >> @@ -986,9 +986,12 @@ int cxl_clear_poison(struct cxl_memdev *cxlmd, >> u64 dpa); >> /* cxl memory scrub functions */ >> #ifdef CONFIG_CXL_SCRUB >> int cxl_mem_patrol_scrub_init(struct cxl_memdev *cxlmd); >> +int cxl_mem_ecs_init(struct cxl_memdev *cxlmd, int region_id); >> #else >> static inline int cxl_mem_patrol_scrub_init(struct cxl_memdev *cxlmd) >> { return -ENOTSUPP; } >> +static inline int cxl_mem_ecs_init(struct cxl_memdev *cxlmd, int >> +region_id) { return -ENOTSUPP; } >> #endif >> >> #ifdef CONFIG_CXL_SUSPEND >> -- >> 2.34.1 >> Thanks, Shiju