will be executed before it actually reads the ID in DTR protocol.
Xilinx Versal tuning algorithm (as suggested by the IP design team):
- Sweep all 127 possible taps and read the flash ID at each tap.
- A particular tap is considered valid only if the correct ID is read
10 times in a row at that tap.
- Once a valid window (window size >= 3) is found, stop scanning the
remaining taps.
- Repeat the above steps with the extra dummy clock count set to 1.
- Compare the maximum window sizes obtained with extra dummy clock 0
and 1, and configure the median of the minimum and maximum valid taps
of the larger window.
- If the larger window is the one obtained with extra dummy clock 1,
all subsequent read operations are performed with one extra dummy clock.
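For reference, the per-sweep window/median selection can be summarized
by the following standalone sketch (illustrative only; pick_median_tap
and tap_valid are hypothetical names, not driver code). For example, if
taps 20..40 all read the ID correctly, the configured tap is
(20 + 40) / 2 = 30.

#include <stdbool.h>

/*
 * Illustrative only: given per-tap results (true when the flash ID was
 * read correctly 10 times at that tap), return the median tap of the
 * widest contiguous valid window, or -1 if no window of at least
 * min_window taps exists. The driver additionally stops early once a
 * window of min_window taps is found and repeats the sweep with one
 * extra dummy clock.
 */
static int pick_median_tap(const bool *tap_valid, int num_taps, int min_window)
{
	int best_size = 0, best_median = -1, start = -1;
	int i;

	for (i = 0; i < num_taps; i++) {
		if (!tap_valid[i]) {
			start = -1;
			continue;
		}
		if (start < 0)
			start = i;
		/* Close the window at the last valid tap of a run. */
		if (i == num_taps - 1 || !tap_valid[i + 1]) {
			int size = i - start + 1;

			if (size > best_size) {
				best_size = size;
				best_median = (start + i) / 2;
			}
		}
	}

	return best_size >= min_window ? best_median : -1;
}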
Signed-off-by: Sai Krishna Potthuri <sai.krishna.potthuri@xxxxxxx>
---
drivers/spi/spi-cadence-quadspi.c | 201 ++++++++++++++++++++++++++++++
1 file changed, 201 insertions(+)
diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
index b55b763003f0..bc4395736def 100644
--- a/drivers/spi/spi-cadence-quadspi.c
+++ b/drivers/spi/spi-cadence-quadspi.c
@@ -93,6 +93,8 @@ struct cqspi_st {
bool slow_sram;
bool use_phy;
bool use_dqs;
+ bool clk_tuned;
+ u8 extra_dummy_clks;
};
struct cqspi_driver_platdata {
@@ -101,6 +103,7 @@ struct cqspi_driver_platdata {
int (*indirect_read_dma)(struct cqspi_flash_pdata *f_pdata,
u_char *rxbuf, loff_t from_addr, size_t n_rx);
u32 (*get_dma_status)(struct cqspi_st *cqspi);
+ int (*execute_tuning)(struct spi_mem *mem, const struct spi_mem_op *op);
};
/* Operation timeout value */
@@ -192,6 +195,7 @@ struct cqspi_driver_platdata {
#define CQSPI_REG_IRQSTATUS 0x40
#define CQSPI_REG_IRQMASK 0x44
+#define CQSPI_REG_VERSAL_ECO 0x48
#define CQSPI_REG_INDIRECTRD 0x60
#define CQSPI_REG_INDIRECTRD_START_MASK BIT(0)
@@ -238,6 +242,19 @@ struct cqspi_driver_platdata {
#define CQSPI_REG_POLLING_STATUS 0xB0
#define CQSPI_REG_POLLING_STATUS_DUMMY_LSB 16
+#define CQSPI_REG_VERSAL_PHY_CONFIG 0xB4
+#define CQSPI_REG_PHY_CONFIG_RESYNC_FLD_MASK BIT(31)
+#define CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK BIT(30)
+#define CQSPI_REG_PHY_CONFIG_TX_DLL_DLY_LSB 16
+
+#define CQSPI_REG_VERSAL_PHY_MASTER_CTRL 0xB8
+#define CQSPI_REG_DLL_LOWER 0xBC
+#define CQSPI_REG_DLL_LOWER_LPBK_LOCK_MASK BIT(15)
+#define CQSPI_REG_DLL_LOWER_DLL_LOCK_MASK BIT(0)
+
+#define CQSPI_REG_VERSAL_DLL_OBSVBLE_UPPER 0xC0
+#define CQSPI_REG_DLL_UPPER_RX_FLD_MASK 0x7F
+
#define CQSPI_REG_OP_EXT_LOWER 0xE0
#define CQSPI_REG_OP_EXT_READ_LSB 24
#define CQSPI_REG_OP_EXT_WRITE_LSB 16
@@ -282,6 +299,12 @@ struct cqspi_driver_platdata {
#define CQSPI_DMA_UNALIGN 0x3
#define CQSPI_REG_VERSAL_DMA_VAL 0x602
+#define CQSPI_VERSAL_MAX_DLL_TAPS 127
+#define CQSPI_VERSAL_TX_TAP_MASTER 0x1E
+#define CQSPI_VERSAL_ECO_MIN_FREQ 120000000
+#define CQSPI_VERSAL_INIT_DLY 4
+#define CQSPI_VERSAL_RDID_MAX_CNT 10
+#define CQSPI_VERSAL_MIN_WINDOW_SZ 3
static int cqspi_wait_for_bit(void __iomem *reg, const u32 mask, bool clr)
{
@@ -540,6 +563,14 @@ static int cqspi_command_read(struct cqspi_flash_pdata *f_pdata,
if (dummy_clk > CQSPI_DUMMY_CLKS_MAX)
return -EOPNOTSUPP;
+ /*
+ * Based on the tuning, some controllers require extra dummy clocks due to additional
+ * delays in the read path. For such controllers, add the required extra dummy clocks to
+ * all subsequent read operations.
+ */
+ if (cqspi->extra_dummy_clks)
+ dummy_clk += cqspi->extra_dummy_clks;
+
if (dummy_clk)
reg |= (dummy_clk & CQSPI_REG_CMDCTRL_DUMMY_MASK)
<< CQSPI_REG_CMDCTRL_DUMMY_LSB;
@@ -662,6 +693,14 @@ static int cqspi_read_setup(struct cqspi_flash_pdata *f_pdata,
if (dummy_clk > CQSPI_DUMMY_CLKS_MAX)
return -EOPNOTSUPP;
+ /*
+ * Based on the tuning, some controllers require extra dummy clocks due to additional
+ * delays in the read path. For such controllers, add the required extra dummy clocks to
+ * all subsequent read operations.
+ */
+ if (cqspi->extra_dummy_clks)
+ dummy_clk += cqspi->extra_dummy_clks;
+
if (dummy_clk)
reg |= (dummy_clk & CQSPI_REG_RD_INSTR_DUMMY_MASK)
<< CQSPI_REG_RD_INSTR_DUMMY_LSB;
@@ -1056,6 +1095,157 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata,
return ret;
}
+static int cqspi_versal_execute_tuning(struct spi_mem *mem, const struct spi_mem_op *op)
+{
+ struct cqspi_st *cqspi = spi_master_get_devdata(mem->spi->master);
+ u8 dummy_incr, dummy_flag = 0, count, max_index, min_index;
+ u8 min_rxtap, max_rxtap, avg_index, max_tap, windowsize;
+ struct platform_device *pdev = cqspi->pdev;
+ struct cqspi_flash_pdata *f_pdata;
+ bool id_matched, rxtapfound;
+ u32 val, i, reg, txtap = 0;
+ s8 max_windowsize = -1;
+ int ret;
+
+ f_pdata = &cqspi->f_pdata[mem->spi->chip_select];
+
+ reg = readl(cqspi->iobase + CQSPI_REG_CONFIG);
+ reg &= CQSPI_REG_CONFIG_ENABLE_MASK;
+ if (!reg)
+ return 0;
+
+ if (cqspi->master_ref_clk_hz >= CQSPI_VERSAL_ECO_MIN_FREQ)
+ writel(1, cqspi->iobase + CQSPI_REG_VERSAL_ECO);
+
+ /* Drive DLL reset bit to low */
+ writel(0, cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+
+ /* Set initial delay value */
+ writel(CQSPI_VERSAL_INIT_DLY, cqspi->iobase + CQSPI_REG_VERSAL_PHY_MASTER_CTRL);
+
+ /* Set DLL reset bit */
+ writel(CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK,
+ cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+
+ /* Check for loopback lock */
+ ret = cqspi_wait_for_bit(cqspi->iobase + CQSPI_REG_DLL_LOWER,
+ CQSPI_REG_DLL_LOWER_LPBK_LOCK_MASK, 0);
+ if (ret) {
+ dev_err(&pdev->dev, "Loopback lock bit error (%i)\n", ret);
+ return ret;
+ }
+
+ /* Re-synchronize slave DLLs */
+ writel(CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK, cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+ writel(CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK | CQSPI_REG_PHY_CONFIG_RESYNC_FLD_MASK,
+ cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+
+ txtap = CQSPI_VERSAL_TX_TAP_MASTER << CQSPI_REG_PHY_CONFIG_TX_DLL_DLY_LSB;
+ max_tap = CQSPI_VERSAL_MAX_DLL_TAPS;
+
+ for (dummy_incr = 0; dummy_incr <= 1; dummy_incr++) {
+ /* Update the extra dummy clocks required to read the ID */
+ cqspi->extra_dummy_clks = dummy_incr;
+ rxtapfound = false;
+ min_rxtap = 0;
+ max_rxtap = 0;
+ max_index = 0;
+ min_index = 0;
+
+ for (i = 0; i <= max_tap; i++) {
+ val = txtap | i | CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK;
+ writel(val, cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+ writel(val | CQSPI_REG_PHY_CONFIG_RESYNC_FLD_MASK,
+ cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+ ret = cqspi_wait_for_bit(cqspi->iobase + CQSPI_REG_DLL_LOWER,
+ CQSPI_REG_DLL_LOWER_DLL_LOCK_MASK, 0);
+ if (ret)
+ return ret;
+
+ /*
+ * To consider a tap as valid, make sure to read the Flash ID
+ * 10 times (recommended by the IP design team). This makes the algorithm
+ * more robust and gives more confidence in the valid tap; it also avoids
+ * marking a tap valid when the correct ID is read only by a fluke.
+ * If the Flash ID does not match the device_id in any one of the
+ * iterations, that particular tap should not be considered valid.
+ */
+ count = 0;
+ do {
+ /* Execute the read ID command and wait for completion */
+ ret = cqspi_command_read(f_pdata, op);
+ if (ret < 0) {
+ dev_err(&pdev->dev,
+ "error %d reading JEDEC ID\n", ret);
+ return ret;
+ }
+
+ id_matched = true;
+ if (memcmp(mem->device_id, op->data.buf.in, op->data.nbytes)) {
+ id_matched = false;
+ break;
+ }
+ } while (id_matched && (count++ <= CQSPI_VERSAL_RDID_MAX_CNT));
+
+ if (id_matched) {
+ u8 current_rxtap = readl(cqspi->iobase +
+ CQSPI_REG_VERSAL_DLL_OBSVBLE_UPPER) &
+ CQSPI_REG_DLL_UPPER_RX_FLD_MASK;
+
+ if (!rxtapfound) {
+ min_rxtap = current_rxtap;
+ min_index = i;
+ rxtapfound = true;
+ }
+
+ max_rxtap = current_rxtap;
+ max_index = i;
+ }
+
+ if (!rxtapfound)
+ continue;
+
+ /* If the ID did not match or the last tap was reached, calculate the window size */
+ if (!id_matched || i == max_tap) {
+ windowsize = max_rxtap - min_rxtap + 1;
+ if (windowsize > max_windowsize) {
+ dummy_flag = dummy_incr;
+ max_windowsize = windowsize;
+ avg_index = (max_index + min_index) / 2;
+ }
+
+ rxtapfound = false;
+ if (windowsize >= CQSPI_VERSAL_MIN_WINDOW_SZ)
+ break;
+ }
+ }
+ }
+
+ /*
+ * Require a minimum window size to ensure reliable data sampling even when the tap
+ * point drifts due to PVT variation, so that a drift in either direction is tolerated.
+ */
+ if (max_windowsize < CQSPI_VERSAL_MIN_WINDOW_SZ)
+ return -EINVAL;
+
+ /* Update the extra dummy clocks based on the maximum window size */
+ cqspi->extra_dummy_clks = dummy_flag;
+
+ val = txtap | avg_index | CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK;
+ writel(val, cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+ writel((CQSPI_REG_PHY_CONFIG_RESYNC_FLD_MASK | val),
+ cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+
+ ret = cqspi_wait_for_bit(cqspi->iobase + CQSPI_REG_DLL_LOWER,
+ CQSPI_REG_DLL_LOWER_DLL_LOCK_MASK, 0);
+ if (ret)
+ return ret;
+
+ cqspi->clk_tuned = true;
+
+ return 0;
+}
+
static void cqspi_chipselect(struct cqspi_flash_pdata *f_pdata)
{
struct cqspi_st *cqspi = f_pdata->cqspi;
@@ -1345,11 +1535,21 @@ static ssize_t cqspi_read(struct cqspi_flash_pdata *f_pdata,
static int cqspi_mem_process(struct spi_mem *mem, const struct spi_mem_op *op)
{
struct cqspi_st *cqspi = spi_master_get_devdata(mem->spi->master);
+ const struct cqspi_driver_platdata *ddata;
+ struct device *dev = &cqspi->pdev->dev;
struct cqspi_flash_pdata *f_pdata;
+ ddata = of_device_get_match_data(dev);
f_pdata = &cqspi->f_pdata[mem->spi->chip_select];
cqspi_configure(f_pdata, mem->spi->max_speed_hz);
+ if (op->cmd.dtr && ddata && ddata->execute_tuning && !cqspi->clk_tuned) {
+ int ret = ddata->execute_tuning(mem, op);
+
+ if (ret)
+ return ret;
+ }
+
if (op->data.dir == SPI_MEM_DATA_IN && op->data.buf.in) {
if (!op->addr.nbytes)
return cqspi_command_read(f_pdata, op);
@@ -1846,6 +2046,7 @@ static const struct cqspi_driver_platdata versal_ospi = {
| CQSPI_HAS_PHY | CQSPI_HAS_DQS,
.indirect_read_dma = cqspi_versal_indirect_read_dma,
.get_dma_status = cqspi_get_versal_dma_status,
+ .execute_tuning = cqspi_versal_execute_tuning,
};
static const struct of_device_id cqspi_dt_ids[] = {