RE: [RFC PATCH v9 08/11] cxl/memscrub: Add CXL memory device ECS control feature

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Fan,

Thanks for the comments.
Sorry for the delay.

>-----Original Message-----
>From: fan <nifan.cxl@xxxxxxxxx>
>Sent: 19 July 2024 19:43
>To: Shiju Jose <shiju.jose@xxxxxxxxxx>
>Cc: linux-edac@xxxxxxxxxxxxxxx; linux-cxl@xxxxxxxxxxxxxxx; linux-
>acpi@xxxxxxxxxxxxxxx; linux-mm@xxxxxxxxx; linux-kernel@xxxxxxxxxxxxxxx;
>bp@xxxxxxxxx; tony.luck@xxxxxxxxx; rafael@xxxxxxxxxx; lenb@xxxxxxxxxx;
>mchehab@xxxxxxxxxx; dan.j.williams@xxxxxxxxx; dave@xxxxxxxxxxxx; Jonathan
>Cameron <jonathan.cameron@xxxxxxxxxx>; dave.jiang@xxxxxxxxx;
>alison.schofield@xxxxxxxxx; vishal.l.verma@xxxxxxxxx; ira.weiny@xxxxxxxxx;
>david@xxxxxxxxxx; Vilas.Sridharan@xxxxxxx; leo.duran@xxxxxxx;
>Yazen.Ghannam@xxxxxxx; rientjes@xxxxxxxxxx; jiaqiyan@xxxxxxxxxx;
>Jon.Grimm@xxxxxxx; dave.hansen@xxxxxxxxxxxxxxx;
>naoya.horiguchi@xxxxxxx; james.morse@xxxxxxx; jthoughton@xxxxxxxxxx;
>somasundaram.a@xxxxxxx; erdemaktas@xxxxxxxxxx; pgonda@xxxxxxxxxx;
>duenwen@xxxxxxxxxx; mike.malvestuto@xxxxxxxxx; gthelen@xxxxxxxxxx;
>wschwartz@xxxxxxxxxxxxxxxxxxx; dferguson@xxxxxxxxxxxxxxxxxxx;
>wbs@xxxxxxxxxxxxxxxxxxxxxx; nifan.cxl@xxxxxxxxx; tanxiaofei
><tanxiaofei@xxxxxxxxxx>; Zengtao (B) <prime.zeng@xxxxxxxxxxxxx>; Roberto
>Sassu <roberto.sassu@xxxxxxxxxx>; kangkang.shen@xxxxxxxxxxxxx;
>wanghuiqiang <wanghuiqiang@xxxxxxxxxx>; Linuxarm
><linuxarm@xxxxxxxxxx>
>Subject: Re: [RFC PATCH v9 08/11] cxl/memscrub: Add CXL memory device ECS
>control feature
>
>On Tue, Jul 16, 2024 at 04:03:32PM +0100, shiju.jose@xxxxxxxxxx wrote:
>> From: Shiju Jose <shiju.jose@xxxxxxxxxx>
>>
>> CXL spec 3.1 section 8.2.9.9.11.2 describes the DDR5 Error Check Scrub
>> (ECS) control feature.
>> The Error Check Scrub (ECS) is a feature defined in JEDEC DDR5 SDRAM
>> Specification (JESD79-5) and allows the DRAM to internally read,
>> correct single-bit errors, and write back corrected data bits to the
>> DRAM array while providing transparency to error counts.
>>
>> The ECS control allows the requester to change the log entry type, the
>> ECS threshold count provided that the request is within the definition
>> specified in DDR5 mode registers, change mode between codeword mode
>> and row count mode, and reset the ECS counter.
>>
>> Register with EDAC RAS control feature driver, which gets the ECS attr
>> descriptors from the EDAC ECS and expose sysfs ECS control attributes
>> to the userspace.
>> For example ECS control for the memory media FRU 0 in CXL mem0 device
>> is in /sys/bus/edac/devices/cxl_mem0/ecs_fru0/
>>
>> Note: The documentation can be added if necessary.
>>
>> Signed-off-by: Shiju Jose <shiju.jose@xxxxxxxxxx>
>> ---
>
>Some lines are too long. And some other comments inline.
Wil fix.
>
>>  drivers/cxl/core/memscrub.c | 429
>> ++++++++++++++++++++++++++++++++++++
>>  1 file changed, 429 insertions(+)
>>
>> diff --git a/drivers/cxl/core/memscrub.c b/drivers/cxl/core/memscrub.c
>> index 430f85b01f6c..9be230ea989a 100644
>> --- a/drivers/cxl/core/memscrub.c
>> +++ b/drivers/cxl/core/memscrub.c
>> @@ -351,13 +351,411 @@ static const struct edac_scrub_ops
>cxl_ps_scrub_ops = {
>>  	.cycle_in_hours_range = cxl_patrol_scrub_read_scrub_cycle_hrs_range,
>>  };
>>
>> +/* CXL DDR5 ECS control definitions */
>> +#define CXL_MEMDEV_ECS_GET_FEAT_VERSION	0x01
>> +#define CXL_MEMDEV_ECS_SET_FEAT_VERSION	0x01
>> +
>> +static const uuid_t cxl_ecs_uuid =
>> +	UUID_INIT(0xe5b13f22, 0x2328, 0x4a14, 0xb8, 0xba, 0xb9, 0x69, 0x1e,
>\
>> +		  0x89, 0x33, 0x86);
>> +
>> +struct cxl_ecs_context {
>> +	u16 num_media_frus;
>> +	u16 get_feat_size;
>> +	u16 set_feat_size;
>> +	struct cxl_memdev *cxlmd;
>> +};
>> +
>> +/**
>> + * struct cxl_ecs_params - CXL memory DDR5 ECS parameter data structure.
>> + * @log_entry_type: ECS log entry type, per DRAM or per memory media
>FRU.
>> + * @threshold: ECS threshold count per GB of memory cells.
>> + * @mode:	codeword/row count mode
>> + *		0 : ECS counts rows with errors
>> + *		1 : ECS counts codeword with errors
>> + * @reset_counter: [IN] reset ECC counter to default value.
>> + */
>> +struct cxl_ecs_params {
>> +	u8 log_entry_type;
>> +	u16 threshold;
>> +	u8 mode;
>
>An enum is defined below, why not directly use enum type here?
Will do.
>
>> +	bool reset_counter;
>> +};
>> +
>> +enum {
>> +	CXL_ECS_PARAM_LOG_ENTRY_TYPE,
>> +	CXL_ECS_PARAM_THRESHOLD,
>> +	CXL_ECS_PARAM_MODE,
>> +	CXL_ECS_PARAM_RESET_COUNTER,
>> +};
>> +
>> +#define	CXL_ECS_LOG_ENTRY_TYPE_MASK	GENMASK(1, 0)
>> +#define	CXL_ECS_REALTIME_REPORT_CAP_MASK	BIT(0)
>> +#define	CXL_ECS_THRESHOLD_COUNT_MASK	GENMASK(2, 0)
>> +#define	CXL_ECS_MODE_MASK	BIT(3)
>> +#define	CXL_ECS_RESET_COUNTER_MASK	BIT(4)
>> +
>> +static const u16 ecs_supp_threshold[] = { 0, 0, 0, 256, 1024, 4096 };
>> +
>> +enum {
>> +	ECS_LOG_ENTRY_TYPE_DRAM = 0x0,
>> +	ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU = 0x1, };
>> +
>> +enum {
>> +	ECS_THRESHOLD_256 = 3,
>> +	ECS_THRESHOLD_1024 = 4,
>> +	ECS_THRESHOLD_4096 = 5,
>> +};
>> +
>> +enum {
>> +	ECS_MODE_COUNTS_ROWS = 0,
>> +	ECS_MODE_COUNTS_CODEWORDS = 1,
>> +};
>> +
>> +struct cxl_ecs_rd_attrs {
>> +	u8 ecs_log_cap;
>> +	u8 ecs_cap;
>> +	__le16 ecs_config;
>> +	u8 ecs_flags;
>> +}  __packed;
>> +
>> +struct cxl_ecs_wr_attrs {
>> +	u8 ecs_log_cap;
>> +	__le16 ecs_config;
>> +}  __packed;
>> +
>> +/* CXL DDR5 ECS control functions */
>> +static int cxl_mem_ecs_get_attrs(struct device *dev, void *drv_data, int
>fru_id,
>> +				 struct cxl_ecs_params *params)
>> +{
>> +	struct cxl_ecs_context *cxl_ecs_ctx = drv_data;
>> +	struct cxl_memdev *cxlmd = cxl_ecs_ctx->cxlmd;
>> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
>> +	size_t rd_data_size;
>> +	u8 threshold_index;
>> +	size_t data_size;
>> +
>> +	rd_data_size = cxl_ecs_ctx->get_feat_size;
>> +
>> +	struct cxl_ecs_rd_attrs *rd_attrs __free(kfree) =
>> +					kmalloc(rd_data_size, GFP_KERNEL);
>> +	if (!rd_attrs)
>> +		return -ENOMEM;
>> +
>> +	params->log_entry_type = 0;
>> +	params->threshold = 0;
>> +	params->mode = 0;
>> +	data_size = cxl_get_feature(mds, cxl_ecs_uuid, rd_attrs,
>> +				    rd_data_size,
>CXL_GET_FEAT_SEL_CURRENT_VALUE);
>> +	if (!data_size)
>> +		return -EIO;
>> +
>> +	params->log_entry_type =
>FIELD_GET(CXL_ECS_LOG_ENTRY_TYPE_MASK,
>> +					   rd_attrs[fru_id].ecs_log_cap);
>> +	threshold_index = FIELD_GET(CXL_ECS_THRESHOLD_COUNT_MASK,
>> +				    rd_attrs[fru_id].ecs_config);
>> +	params->threshold = ecs_supp_threshold[threshold_index];
>> +	params->mode = FIELD_GET(CXL_ECS_MODE_MASK,
>> +				 rd_attrs[fru_id].ecs_config);
>> +	return 0;
>> +}
>> +
>> +static int cxl_mem_ecs_set_attrs(struct device *dev, void *drv_data, int
>fru_id,
>> +				 struct cxl_ecs_params *params, u8
>param_type) {
>> +	struct cxl_ecs_context *cxl_ecs_ctx = drv_data;
>> +	struct cxl_memdev *cxlmd = cxl_ecs_ctx->cxlmd;
>> +	struct cxl_dev_state *cxlds = cxlmd->cxlds;
>> +	struct cxl_memdev_state *mds = to_cxl_memdev_state(cxlds);
>> +	size_t rd_data_size, wr_data_size;
>> +	u16 num_media_frus, count;
>> +	size_t data_size;
>> +	int ret;
>> +
>> +	num_media_frus = cxl_ecs_ctx->num_media_frus;
>> +	rd_data_size = cxl_ecs_ctx->get_feat_size;
>> +	wr_data_size = cxl_ecs_ctx->set_feat_size;
>> +	struct cxl_ecs_rd_attrs *rd_attrs __free(kfree) =
>> +				kmalloc(rd_data_size, GFP_KERNEL);
>> +	if (!rd_attrs)
>> +		return -ENOMEM;
>> +
>> +	data_size = cxl_get_feature(mds, cxl_ecs_uuid, rd_attrs,
>> +				    rd_data_size,
>CXL_GET_FEAT_SEL_CURRENT_VALUE);
>> +	if (!data_size)
>> +		return -EIO;
>> +	struct cxl_ecs_wr_attrs *wr_attrs __free(kfree) =
>> +					kmalloc(wr_data_size, GFP_KERNEL);
>> +	if (!wr_attrs)
>> +		return -ENOMEM;
>> +
>> +	/* Fill writable attributes from the current attributes read for all the
>media FRUs */
>> +	for (count = 0; count < num_media_frus; count++) {
>> +		wr_attrs[count].ecs_log_cap = rd_attrs[count].ecs_log_cap;
>> +		wr_attrs[count].ecs_config = rd_attrs[count].ecs_config;
>> +	}
>> +
>> +	/* Fill attribute to be set for the media FRU */
>> +	switch (param_type) {
>> +	case CXL_ECS_PARAM_LOG_ENTRY_TYPE:
>> +		if (params->log_entry_type != ECS_LOG_ENTRY_TYPE_DRAM
>&&
>> +		    params->log_entry_type !=
>ECS_LOG_ENTRY_TYPE_MEM_MEDIA_FRU) {
>> +			dev_err(dev,
>> +				"Invalid CXL ECS scrub log entry type(%d) to
>set\n",
>> +			       params->log_entry_type);
>> +			dev_err(dev,
>> +				"Log Entry Type 0: per DRAM  1: per Memory
>Media FRU\n");
>> +			return -EINVAL;
>> +		}
>> +		wr_attrs[fru_id].ecs_log_cap =
>FIELD_PREP(CXL_ECS_LOG_ENTRY_TYPE_MASK,
>> +							  params-
>>log_entry_type);
>> +		break;
>> +	case CXL_ECS_PARAM_THRESHOLD:
>> +		wr_attrs[fru_id].ecs_config &=
>~CXL_ECS_THRESHOLD_COUNT_MASK;
>> +		switch (params->threshold) {
>> +		case 256:
>> +			wr_attrs[fru_id].ecs_config |= FIELD_PREP(
>> +					CXL_ECS_THRESHOLD_COUNT_MASK,
>> +						ECS_THRESHOLD_256);
>> +			break;
>> +		case 1024:
>> +			wr_attrs[fru_id].ecs_config |= FIELD_PREP(
>> +
>	CXL_ECS_THRESHOLD_COUNT_MASK,
>> +						ECS_THRESHOLD_1024);
>> +			break;
>> +		case 4096:
>> +			wr_attrs[fru_id].ecs_config |= FIELD_PREP(
>> +
>	CXL_ECS_THRESHOLD_COUNT_MASK,
>> +						ECS_THRESHOLD_4096);
>> +			break;
>> +		default:
>> +			dev_err(dev,
>> +				"Invalid CXL ECS scrub threshold count(%d) to
>set\n",
>> +				params->threshold);
>> +			dev_err(dev,
>> +				"Supported scrub threshold count:
>256,1024,4096\n");
>> +			return -EINVAL;
>> +		}
>> +		break;
>> +	case CXL_ECS_PARAM_MODE:
>> +		if (params->mode != ECS_MODE_COUNTS_ROWS &&
>> +		    params->mode != ECS_MODE_COUNTS_CODEWORDS) {
>> +			dev_err(dev,
>> +				"Invalid CXL ECS scrub mode(%d) to set\n",
>> +				params->mode);
>> +			dev_err(dev,
>> +				"Mode 0: ECS counts rows with errors"
>> +				" 1: ECS counts codewords with errors\n");
>> +			return -EINVAL;
>> +		}
>> +		wr_attrs[fru_id].ecs_config &= ~CXL_ECS_MODE_MASK;
>> +		wr_attrs[fru_id].ecs_config |=
>FIELD_PREP(CXL_ECS_MODE_MASK,
>> +							  params->mode);
>> +		break;
>> +	case CXL_ECS_PARAM_RESET_COUNTER:
>> +		wr_attrs[fru_id].ecs_config &=
>~CXL_ECS_RESET_COUNTER_MASK;
>> +		wr_attrs[fru_id].ecs_config |=
>FIELD_PREP(CXL_ECS_RESET_COUNTER_MASK,
>> +							  params-
>>reset_counter);
>> +		break;
>> +	default:
>> +		dev_err(dev, "Invalid CXL ECS parameter to set\n");
>> +		return -EINVAL;
>> +	}
>> +
>> +	ret = cxl_set_feature(mds, cxl_ecs_uuid,
>CXL_MEMDEV_ECS_SET_FEAT_VERSION,
>> +			      wr_attrs, wr_data_size,
>> +
>CXL_SET_FEAT_FLAG_DATA_SAVED_ACROSS_RESET);
>> +	if (ret) {
>> +		dev_err(dev, "CXL ECS set feature failed ret=%d\n", ret);
>> +		return ret;
>> +	}
>> +
>> +	return 0;
>> +}
>> +
>> +static int cxl_ecs_get_log_entry_type(struct device *dev, void
>> +*drv_data, int fru_id, u64 *val) {
>> +	struct cxl_ecs_params params;
>> +	int ret;
>> +
>> +	ret = cxl_mem_ecs_get_attrs(dev, drv_data, fru_id, &params);
>> +	if (ret)
>> +		return ret;
>> +
>> +	*val = params.log_entry_type;
>> +
>> +	return 0;
>> +}
>> +
>> +static int cxl_ecs_set_log_entry_type(struct device *dev, void
>> +*drv_data, int fru_id, u64 val) {
>> +	struct cxl_ecs_params params = {
>> +		.log_entry_type = val,
>> +	};
>> +
>> +	return cxl_mem_ecs_set_attrs(dev, drv_data, fru_id, &params,
>> +CXL_ECS_PARAM_LOG_ENTRY_TYPE); }
>> +
>> +static int cxl_ecs_get_log_entry_type_per_dram(struct device *dev, void
>*drv_data,
>> +					       int fru_id, u64 *val)
>
>I may have missed something. We have cxl_ecs_get_log_entry_type, and what is
>cxl_ecs_get_log_entry_type_per_memory_media and
>cxl_ecs_get_log_entry_type_per_dram for?
Reason for adding these readonly attributes to avoid user need to check the spec to
interpret the value set or the supported options for ECS log type.

[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux