Re: [PATCH v12 11/17] cxl/memfeature: Add CXL memory device ECS control feature

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Sep 11, 2024 at 10:04:40AM +0100, shiju.jose@xxxxxxxxxx wrote:
> From: Shiju Jose <shiju.jose@xxxxxxxxxx>
> 
> CXL spec 3.1 section 8.2.9.9.11.2 describes the DDR5 ECS (Error Check
> Scrub) control feature.
> The Error Check Scrub (ECS) is a feature defined in JEDEC DDR5 SDRAM
> Specification (JESD79-5) and allows the DRAM to internally read, correct
> single-bit errors, and write back corrected data bits to the DRAM array
> while providing transparency to error counts.
> 
> The ECS control allows the requester to change the log entry type, the ECS
> threshold count provided that the request is within the definition
> specified in DDR5 mode registers, change mode between codeword mode and
> row count mode, and reset the ECS counter.
> 
> Register with EDAC RAS control feature driver, which gets the ECS attr
> descriptors from the EDAC ECS and expose sysfs ECS control attributes
> to the userspace.
> For example ECS control for the memory media FRU 0 in CXL mem0 device is
> in /sys/bus/edac/devices/cxl_mem0/ecs_fru0/
> 
> Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx>
> ---
>  drivers/cxl/core/memfeature.c | 439 +++++++++++++++++++++++++++++++++-
>  1 file changed, 438 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/cxl/core/memfeature.c b/drivers/cxl/core/memfeature.c
> index 90c68d20b02b..5d4057fa304c 100644
> --- a/drivers/cxl/core/memfeature.c
> +++ b/drivers/cxl/core/memfeature.c
> @@ -19,7 +19,7 @@
>  #include <cxl.h>
>  #include <linux/edac.h>
>  
> -#define CXL_DEV_NUM_RAS_FEATURES	1
> +#define CXL_DEV_NUM_RAS_FEATURES	2
>  #define CXL_DEV_HOUR_IN_SECS	3600
>  
>  #define CXL_SCRUB_NAME_LEN	128
> @@ -303,6 +303,405 @@ static const struct edac_scrub_ops cxl_ps_scrub_ops = {
>  	.cycle_duration_write = cxl_patrol_scrub_write_scrub_cycle,
>  };
>  
...
> +	case CXL_ECS_PARAM_THRESHOLD:
> +		wr_attrs[fru_id].ecs_config &= ~CXL_ECS_THRESHOLD_COUNT_MASK;
> +		switch (params->threshold) {
> +		case 256:
> +			wr_attrs[fru_id].ecs_config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK,
> +								  ECS_THRESHOLD_256);
> +			break;
> +		case 1024:
> +			wr_attrs[fru_id].ecs_config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK,
> +								  ECS_THRESHOLD_1024);
> +			break;
> +		case 4096:
> +			wr_attrs[fru_id].ecs_config |= FIELD_PREP(CXL_ECS_THRESHOLD_COUNT_MASK,
> +								  ECS_THRESHOLD_4096);
> +			break;
> +		default:
> +			dev_err(dev,
> +				"Invalid CXL ECS scrub threshold count(%d) to set\n",
> +				params->threshold);
> +			dev_err(dev,
> +				"Supported scrub threshold count: 256,1024,4096\n");
> +			return -EINVAL;
> +		}
> +		break;
> +	case CXL_ECS_PARAM_MODE:
> +		if (params->mode != ECS_MODE_COUNTS_ROWS &&
> +		    params->mode != ECS_MODE_COUNTS_CODEWORDS) {
> +			dev_err(dev,
> +				"Invalid CXL ECS scrub mode(%d) to set\n",
> +				params->mode);
> +			dev_err(dev,
> +				"Mode 0: ECS counts rows with errors"
> +				" 1: ECS counts codewords with errors\n");
The messaging here can be improved. When printed out in dmesg, it looks
like

root@localhost:~# echo 2 > /sys/bus/edac/devices/cxl_mem0/ecs_fru0/mode
----
[ 6099.073006] cxl_mem mem0: Invalid CXL ECS scrub mode(2) to set
[ 6099.074407] cxl_mem mem0: Mode 0: ECS counts rows with errors 1: ECS counts codewords with errors
----
Maybe use similar message format as threshold above, like
+			dev_err(dev,
+				"Supported ECS mode: 0: ECS counts rows with errors; 1: ECS counts codewords with errors\n");

Fan
> +			return -EINVAL;
> +		}
> +		wr_attrs[fru_id].ecs_config &= ~CXL_ECS_MODE_MASK;
> +		wr_attrs[fru_id].ecs_config |= FIELD_PREP(CXL_ECS_MODE_MASK,
> +							  params->mode);
> +		break;
> +	case CXL_ECS_PARAM_RESET_COUNTER:
> +		wr_attrs[fru_id].ecs_config &= ~CXL_ECS_RESET_COUNTER_MASK;
> +		wr_attrs[fru_id].ecs_config |= FIELD_PREP(CXL_ECS_RESET_COUNTER_MASK,
> +							  params->reset_counter);
> +		break;
> +	default:
> +		dev_err(dev, "Invalid CXL ECS parameter to set\n");
> +		return -EINVAL;
> +	}
> +
> +	ret = cxl_set_feature(cxlds, cxl_ecs_uuid, cxl_ecs_ctx->set_version,
> +			      wr_attrs, wr_data_size,
> +			      CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET);
> +	if (ret) {
> +		dev_err(dev, "CXL ECS set feature failed ret=%d\n", ret);
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +static int cxl_ecs_get_log_entry_type(struct device *dev, void *drv_data,
> +				      int fru_id, u32 *val)
> +{
> +	struct cxl_ecs_params params;
> +	int ret;
> +
> +	ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, &params);
> +	if (ret)
> +		return ret;
> +
> +	*val = params.log_entry_type;
> +
> +	return 0;
> +}
> +
> +static int cxl_ecs_set_log_entry_type(struct device *dev, void *drv_data,
> +				      int fru_id, u32 val)
> +{
> +	struct cxl_ecs_params params = {
> +		.log_entry_type = val,
> +	};
> +
> +	return cxl_mem_ecs_set_attrs(dev, drv_data, fru_id,
> +				     &params, CXL_ECS_PARAM_LOG_ENTRY_TYPE);
> +}
> +
> +static int cxl_ecs_get_log_entry_type_per_dram(struct device *dev, void *drv_data,
> +					       int fru_id, u32 *val)
> +{
> +	struct cxl_ecs_params params;
> +	int ret;
> +
> +	ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, &params);
> +	if (ret)
> +		return ret;
> +
> +	if (params.log_entry_type == ECS_LOG_ENTRY_TYPE_DRAM)
> +		*val = 1;
> +	else
> +		*val = 0;
> +
> +	return 0;
> +}
> +
> +static int cxl_ecs_get_log_entry_type_per_memory_media(struct device *dev,
> +						       void *drv_data,
> +						       int fru_id, u32 *val)
> +{
> +	struct cxl_ecs_params params;
> +	int ret;
> +
> +	ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, &params);
> +	if (ret)
> +		return ret;
> +
> +	if (params.log_entry_type == ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU)
> +		*val = 1;
> +	else
> +		*val = 0;
> +
> +	return 0;
> +}
> +
> +static int cxl_ecs_get_mode(struct device *dev, void *drv_data,
> +			    int fru_id, u32 *val)
> +{
> +	struct cxl_ecs_params params;
> +	int ret;
> +
> +	ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, &params);
> +	if (ret)
> +		return ret;
> +
> +	*val = params.mode;
> +
> +	return 0;
> +}
> +
> +static int cxl_ecs_set_mode(struct device *dev, void *drv_data,
> +			    int fru_id, u32 val)
> +{
> +	struct cxl_ecs_params params = {
> +		.mode = val,
> +	};
> +
> +	return cxl_mem_ecs_set_attrs(dev, drv_data, fru_id,
> +				     &params, CXL_ECS_PARAM_MODE);
> +}
> +
> +static int cxl_ecs_get_mode_counts_rows(struct device *dev, void *drv_data,
> +					int fru_id, u32 *val)
> +{
> +	struct cxl_ecs_params params;
> +	int ret;
> +
> +	ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, &params);
> +	if (ret)
> +		return ret;
> +
> +	if (params.mode == ECS_MODE_COUNTS_ROWS)
> +		*val = 1;
> +	else
> +		*val = 0;
> +
> +	return 0;
> +}
> +
> +static int cxl_ecs_get_mode_counts_codewords(struct device *dev, void *drv_data,
> +					     int fru_id, u32 *val)
> +{
> +	struct cxl_ecs_params params;
> +	int ret;
> +
> +	ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, &params);
> +	if (ret)
> +		return ret;
> +
> +	if (params.mode == ECS_MODE_COUNTS_CODEWORDS)
> +		*val = 1;
> +	else
> +		*val = 0;
> +
> +	return 0;
> +}
> +
> +static int cxl_ecs_reset(struct device *dev, void *drv_data, int fru_id, u32 val)
> +{
> +	struct cxl_ecs_params params = {
> +		.reset_counter = val,
> +	};
> +
> +	return cxl_mem_ecs_set_attrs(dev, drv_data, fru_id,
> +				     &params, CXL_ECS_PARAM_RESET_COUNTER);
> +}
> +
> +static int cxl_ecs_get_threshold(struct device *dev, void *drv_data,
> +				 int fru_id, u32 *val)
> +{
> +	struct cxl_ecs_params params;
> +	int ret;
> +
> +	ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, &params);
> +	if (ret)
> +		return ret;
> +
> +	*val = params.threshold;
> +
> +	return 0;
> +}
> +
> +static int cxl_ecs_set_threshold(struct device *dev, void *drv_data,
> +				 int fru_id, u32 val)
> +{
> +	struct cxl_ecs_params params = {
> +		.threshold = val,
> +	};
> +
> +	return cxl_mem_ecs_set_attrs(dev, drv_data, fru_id,
> +				     &params, CXL_ECS_PARAM_THRESHOLD);
> +}
> +
> +static const struct edac_ecs_ops cxl_ecs_ops = {
> +	.get_log_entry_type = cxl_ecs_get_log_entry_type,
> +	.set_log_entry_type = cxl_ecs_set_log_entry_type,
> +	.get_log_entry_type_per_dram = cxl_ecs_get_log_entry_type_per_dram,
> +	.get_log_entry_type_per_memory_media =
> +				cxl_ecs_get_log_entry_type_per_memory_media,
> +	.get_mode = cxl_ecs_get_mode,
> +	.set_mode = cxl_ecs_set_mode,
> +	.get_mode_counts_codewords = cxl_ecs_get_mode_counts_codewords,
> +	.get_mode_counts_rows = cxl_ecs_get_mode_counts_rows,
> +	.reset = cxl_ecs_reset,
> +	.get_threshold = cxl_ecs_get_threshold,
> +	.set_threshold = cxl_ecs_set_threshold,
> +};
> +
>  int cxl_mem_ras_features_init(struct cxl_memdev *cxlmd, struct cxl_region *cxlr)
>  {
>  	struct edac_dev_feature ras_features[CXL_DEV_NUM_RAS_FEATURES];
> @@ -310,7 +709,9 @@ int cxl_mem_ras_features_init(struct cxl_memdev *cxlmd, struct cxl_region *cxlr)
>  	struct cxl_patrol_scrub_context *cxl_ps_ctx;
>  	struct cxl_feat_entry feat_entry;
>  	char cxl_dev_name[CXL_SCRUB_NAME_LEN];
> +	struct cxl_ecs_context *cxl_ecs_ctx;
>  	int rc, i, num_ras_features = 0;
> +	int num_media_frus;
>  
>  	if (cxlr) {
>  		struct cxl_region_params *p = &cxlr->params;
> @@ -366,6 +767,42 @@ int cxl_mem_ras_features_init(struct cxl_memdev *cxlmd, struct cxl_region *cxlr)
>  	ras_features[num_ras_features].ctx = cxl_ps_ctx;
>  	num_ras_features++;
>  
> +	if (!cxlr) {
> +		rc = cxl_get_supported_feature_entry(cxlds, &cxl_ecs_uuid,
> +						     &feat_entry);
> +		if (rc < 0)
> +			goto feat_register;
> +
> +		if (!(feat_entry.attr_flags & CXL_FEAT_ENTRY_FLAG_CHANGABLE))
> +			goto feat_register;
> +		num_media_frus = feat_entry.get_feat_size /
> +					sizeof(struct cxl_ecs_rd_attrs);
> +		if (!num_media_frus)
> +			goto feat_register;
> +
> +		cxl_ecs_ctx = devm_kzalloc(&cxlmd->dev, sizeof(*cxl_ecs_ctx),
> +					   GFP_KERNEL);
> +		if (!cxl_ecs_ctx)
> +			goto feat_register;
> +		*cxl_ecs_ctx = (struct cxl_ecs_context) {
> +			.get_feat_size = feat_entry.get_feat_size,
> +			.set_feat_size = feat_entry.set_feat_size,
> +			.get_version = feat_entry.get_feat_ver,
> +			.set_version = feat_entry.set_feat_ver,
> +			.set_effects = feat_entry.set_effects,
> +			.num_media_frus = num_media_frus,
> +			.cxlmd = cxlmd,
> +		};
> +
> +		ras_features[num_ras_features].ft_type = RAS_FEAT_ECS;
> +		ras_features[num_ras_features].ecs_ops = &cxl_ecs_ops;
> +		ras_features[num_ras_features].ctx = cxl_ecs_ctx;
> +		ras_features[num_ras_features].ecs_info.num_media_frus =
> +								num_media_frus;
> +		num_ras_features++;
> +	}
> +
> +feat_register:
>  	return edac_dev_register(&cxlmd->dev, cxl_dev_name, NULL,
>  				 num_ras_features, ras_features);
>  }
> -- 
> 2.34.1
> 

-- 
Fan Ni




[Index of Archives]     [Linux IBM ACPI]     [Linux Power Management]     [Linux Kernel]     [Linux Laptop]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Video 4 Linux]     [Device Mapper]     [Linux Resources]
  Powered by Linux