Re: [PATCH 07/10] mci: add support for discarding write blocks

Sascha Hauer <s.hauer@xxxxxxxxxxxxxx> · Tue, 30 Jul 2024 12:05:00 +0200

On Tue, Jul 30, 2024 at 09:19:26AM +0200, Ahmad Fatoum wrote:
>  /**
>   * Read one or several block(s) of data from the card
>   * @param mci MCI instance
> @@ -773,6 +850,49 @@ static int sd_switch(struct mci *mci, unsigned mode, unsigned group,
>  	return mci_send_cmd(mci, &cmd, &data);
>  }
>  
> +static int sd_read_ssr(struct mci *mci)
> +{
> +	static const unsigned int sd_au_size[] = {
> +		0,		SZ_16K / 512,		SZ_32K / 512,
> +		SZ_64K / 512,	SZ_128K / 512,		SZ_256K / 512,
> +		SZ_512K / 512,	SZ_1M / 512,		SZ_2M / 512,
> +		SZ_4M / 512,	SZ_8M / 512,		(SZ_8M + SZ_4M) / 512,
> +		SZ_16M / 512,	(SZ_16M + SZ_8M) / 512,	SZ_32M / 512,
> +		SZ_64M / 512,
> +	};
> +	__be32 *ssr;
> +	int err;
> +	unsigned int au, eo, et, es;
> +
> +	if (!IS_ENABLED(CONFIG_MCI_ERASE))
> +		return -ENOSYS;

I think we settled on using -EOPNOTSUPP in this case.

> +static unsigned int mmc_align_erase_size(struct mci *card,
> +					 sector_t *from,
> +					 sector_t *to,
> +					 blkcnt_t nr)
> +{
> +	unsigned int from_new = *from, to_new, nr_new = nr, rem;
> +
> +	/*
> +	 * When the 'card->erase_size' is power of 2, we can use round_up/down()
> +	 * to align the erase size efficiently.
> +	 */
> +	if (is_power_of_2(card->erase_grp_size)) {
> +		unsigned int temp = from_new;
> +
> +		from_new = round_up(temp, card->erase_grp_size);
> +		rem = from_new - temp;
> +
> +		if (nr_new > rem)
> +			nr_new -= rem;
> +		else
> +			return 0;
> +
> +		nr_new = round_down(nr_new, card->erase_grp_size);
> +	} else {
> +		rem = from_new % card->erase_grp_size;
> +		if (rem) {
> +			rem = card->erase_grp_size - rem;
> +			from_new += rem;
> +			if (nr_new > rem)
> +				nr_new -= rem;
> +			else
> +				return 0;
> +		}
> +
> +		rem = nr_new % card->erase_grp_size;
> +		if (rem)
> +			nr_new -= rem;
> +	}
> +
> +	if (nr_new == 0)
> +		return 0;
> +
> +	to_new = from_new + nr_new;
> +
> +	if (*to != to_new || *from != from_new)
> +		dev_warn(&card->dev, "Erase range changed to [0x%x-0x%x] because of %u sector erase group\n",
> +			 from_new, to_new, card->erase_grp_size);
> +
> +	*to = to_new;
> +	*from = from_new;
> +
> +	return nr_new;
> +}
> +
> +/**
> + * Erase a memory region
> + * @param blk All info about the block device we need
> + * @param block first block to erase
> + * @param num_blocks Number of blocks to erase
> + * @return 0 on success, anything else on failure
> + *
> + */
> +static int mci_sd_erase(struct block_device *blk, sector_t from,
> +			blkcnt_t blkcnt)
> +{
> +	struct mci_part *part = container_of(blk, struct mci_part, blk);
> +	struct mci *mci = part->mci;
> +	sector_t i = 0;
> +	unsigned arg;
> +	sector_t to = from + blkcnt;
> +	int rc;
> +
> +	mci_blk_part_switch(part);
> +
> +	rc = mci_sd_check_write(mci, "Erase", from, blkcnt);
> +	if (rc)
> +		return rc;
> +
> +	if (!mci->erase_grp_size)
> +		return -EOPNOTSUPP;
> +
> +	if (mci->can_trim) {
> +		arg = MMC_TRIM_ARG;
> +	} else {
> +		/* We don't use discard, as it doesn't guarantee a fixed value */
> +		arg = MMC_ERASE_ARG;
> +		blkcnt = mmc_align_erase_size(mci, &from, &to, blkcnt);
> +	}
> +
> +	if (blkcnt == 0)
> +		return 0;
> +
> +	if (to <= from)
> +		return -EINVAL;

When mmc_align_erase_size() is not called, 'to' is still from + blkcnt,
so this test could only trigger for blkcnt == 0, and in that case we
have already returned in the if (blkcnt == 0) check above.
When mmc_align_erase_size() is called and this test triggers, then it
only reveals a bug in mmc_align_erase_size().

I think this test should go away.

> +
> +	/* 'from' and 'to' are inclusive */
> +	to -= 1;
> +
> +	while (i < blkcnt) {
> +		sector_t blk_r;
> +
> +		/* TODO: While it's possible to clear many erase groups at once
> +		 * and it greatly improves throughput, drivers need adjustment:
> +		 *
> +		 * Many drivers hardcode a maximal wait time before aborting
> +		 * the wait for R1b and returning -ETIMEDOUT. With long
> +		 * erases/trims, we are bound to run into this timeout, so for now
> +		 * we just split into suifficiently small erases that are unlikely
> +		 * to trigger the time.
> +		 *
> +		 * What Linux does and what we should be doing in barebox is:
> +		 *
> +		 *  - add a struct mci_cmd::busy_timeout member that drivers should
> +		 *    use instead of hardcoding their own timeout delay. The busy
> +		 *    timeout length can be calculated by the MCI core after
> +		 *    consulting the appropriate CSD/EXT_CSD/SSR registers.
> +		 *
> +		 *  - add a struct mci_host::max_busy_timeout member, where drivers
> +		 *    can indicate the maximum timeout they are able to support.
> +		 *    The MCI core will never set a busy_timeout that exceeds this
> +		 *    value.
> +		 *
> +		 *  Example Samsung eMMC 8GTF4:
> +		 *
> +		 *    time erase /dev/mmc2.part_of_512m # 1024 trims
> +		 *    time: 2849ms
> +		 *
> +		 *    time erase /dev/mmc2.part_of_512m # single trim
> +		 *    time: 56ms
> +		 */
> +
> +		if (IS_SD(mci) && mci->ssr.au) {
> +			blk_r = ((blkcnt - i) > mci->ssr.au) ?
> +				mci->ssr.au : (blkcnt - i);
> +		} else {
> +			blk_r = ((blkcnt - i) > mci->erase_grp_size) ?
> +				mci->erase_grp_size : (blkcnt - i);
> +		}
> +
> +		rc =  mci_block_erase(mci, from, to, arg);

You say you split up the whole erase into sufficiently small erases, but
'from' and 'to' are never changed in this loop; blk_r is only used to
advance i. So you seem to erase the whole area once per iteration.

> +		if (rc)
> +			break;
> +
> +		/* Waiting for the ready status */
> +		rc = mci_poll_until_ready(mci, 1000 /* ms */);
> +		if (rc)
> +			break;
> +
> +		i += blk_r;
> +	}
> +
> +	return i == blkcnt ? 0 : rc;
> +}

Sascha

-- 
Pengutronix e.K.                           |                             |
Steuerwalder Str. 21                       | http://www.pengutronix.de/  |
31137 Hildesheim, Germany                  | Phone: +49-5121-206917-0    |
Amtsgericht Hildesheim, HRA 2686           | Fax:   +49-5121-206917-5555 |