On 10/14/2024 3:52 PM, Bjorn Andersson wrote:
On Wed, Sep 18, 2024 at 09:52:54AM -0600, Jeffrey Hugo wrote:
The Sahara protocol has a crashdump functionality. In the hello
exchange, the device can advertise it has a memory dump available for
the host to collect. Instead of the device making requests of the host,
the host requests data from the device which can be later analyzed.
Implement this functionality and utilize the devcoredump framework for
handing the dump over to userspace.
Similar to how firmware loading in Sahara involves multiple files,
crashdump can consist of multiple files for different parts of the dump.
Structure these into a single buffer that userspace can parse and
extract the original files from.
Looks good. No technical concerns but a few stylistic comments below.
Reviewed-by: Carl Vanderlip <quic_carlv@xxxxxxxxxxx>
Signed-off-by: Jeffrey Hugo <quic_jhugo@xxxxxxxxxxx>
---
drivers/accel/qaic/sahara.c | 394 +++++++++++++++++++++++++++++++++++-
1 file changed, 387 insertions(+), 7 deletions(-)
diff --git a/drivers/accel/qaic/sahara.c b/drivers/accel/qaic/sahara.c
index bf94bbab6be5..ed792b615ea2 100644
--- a/drivers/accel/qaic/sahara.c
+++ b/drivers/accel/qaic/sahara.c
@@ -2,6 +2,7 @@
/* Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. */
+#include <linux/devcoredump.h>
#include <linux/firmware.h>
#include <linux/limits.h>
#include <linux/mhi.h>
@@ -9,6 +10,7 @@
#include <linux/mod_devicetable.h>
#include <linux/overflow.h>
#include <linux/types.h>
+#include <linux/vmalloc.h>
#include <linux/workqueue.h>
#include "sahara.h"
@@ -36,12 +38,14 @@
#define SAHARA_PACKET_MAX_SIZE 0xffffU /* MHI_MAX_MTU */
#define SAHARA_TRANSFER_MAX_SIZE 0x80000
+#define SAHARA_READ_MAX_SIZE 0xfff0U /* Avoid unaligned requests */
#define SAHARA_NUM_TX_BUF DIV_ROUND_UP(SAHARA_TRANSFER_MAX_SIZE,\
SAHARA_PACKET_MAX_SIZE)
#define SAHARA_IMAGE_ID_NONE U32_MAX
#define SAHARA_VERSION 2
#define SAHARA_SUCCESS 0
+#define SAHARA_TABLE_ENTRY_STR_LEN 20
#define SAHARA_MODE_IMAGE_TX_PENDING 0x0
#define SAHARA_MODE_IMAGE_TX_COMPLETE 0x1
@@ -53,6 +57,8 @@
#define SAHARA_END_OF_IMAGE_LENGTH 0x10
#define SAHARA_DONE_LENGTH 0x8
#define SAHARA_RESET_LENGTH 0x8
+#define SAHARA_MEM_DEBUG64_LENGTH 0x18
+#define SAHARA_MEM_READ64_LENGTH 0x18
struct sahara_packet {
__le32 cmd;
@@ -80,18 +86,95 @@ struct sahara_packet {
__le32 image;
__le32 status;
} end_of_image;
+ struct {
+ __le64 table_address;
+ __le64 table_length;
+ } memory_debug64;
+ struct {
+ __le64 memory_address;
+ __le64 memory_length;
+ } memory_read64;
};
};
+struct sahara_debug_table_entry64 {
+ __le64 type;
+ __le64 address;
+ __le64 length;
+ char description[SAHARA_TABLE_ENTRY_STR_LEN];
+ char filename[SAHARA_TABLE_ENTRY_STR_LEN];
+};
+
+struct sahara_dump_table_entry {
+ u64 type;
+ u64 address;
+ u64 length;
+ char description[SAHARA_TABLE_ENTRY_STR_LEN];
+ char filename[SAHARA_TABLE_ENTRY_STR_LEN];
+};
+
+#define SAHARA_DUMP_V1_MAGIC 0x1234567890abcdef
+#define SAHARA_DUMP_V1_VER 1
+struct sahara_memory_dump_meta_v1 {
+ u64 magic;
+ u64 version;
+ u64 dump_size;
+ u64 table_size;
+};
+
+/*
+ * Layout of crashdump
Please clarify that this is the structure of the dump you expose to the
user, not what comes out of the device.
Will do.
+ * +------------------------------------------+
+ * | Crashdump Meta structure |
+ * | type: struct sahara_memory_dump_meta_v1 |
+ * +------------------------------------------+
+ * | Crashdump Table |
+ * | type: array of struct |
+ * | sahara_dump_table_entry |
+ * | |
+ * | |
+ * +------------------------------------------+
+ * | Crashdump |
+ * | |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------------------------+
+ *
+ * First is the metadata header. Userspace can use the magic number to verify
+ * the content type, and then check the version for the rest of the format.
+ * New versions should keep the magic number location/value, and version
+ * location, but increment the version value.
+ *
+ * For v1, the metadata lists the size of the entire dump (header + table +
+ * dump) and the size of the table. Then the dump image table, which describes
+ * the contents of the dump. Finally all the images are listed in order, with
+ * no deadspace in between. Userspace can use the sizes listed in the image
+ * table to reconstruct the individual images.
+ */
+
struct sahara_context {
struct sahara_packet *tx[SAHARA_NUM_TX_BUF];
struct sahara_packet *rx;
- struct work_struct work;
+ struct work_struct fw_work;
+ struct work_struct dump_work;
struct mhi_device *mhi_dev;
const char **image_table;
u32 table_size;
u32 active_image_id;
const struct firmware *firmware;
+ u64 dump_table_address;
+ u64 dump_table_length;
+ size_t rx_size;
+ size_t rx_size_requested;
+ void *mem_dump;
+ size_t mem_dump_sz;
+ struct sahara_dump_table_entry *dump_image;
+ u64 dump_image_offset;
+ void *mem_dump_freespace;
+ u64 dump_images_left;
That's a lot of images; and it's just a natural number. How about
"unsigned int" instead to convey that the size "doesn't matter"?
Hmm, this is derived from a 64-bit value that comes from the device. If
we downgrade this to uint (32-bit) we'd need to do some overflow
checking. Having a matched type and not worrying about the conversion
makes me feel better.
You still prefer uint?
+ bool is_mem_dump_mode;
};
static const char *aic100_image_table[] = {
@@ -153,6 +236,8 @@ static void sahara_send_reset(struct sahara_context *context)
{
int ret;
+ context->is_mem_dump_mode = false;
+
context->tx[0]->cmd = cpu_to_le32(SAHARA_RESET_CMD);
context->tx[0]->length = cpu_to_le32(SAHARA_RESET_LENGTH);
@@ -186,7 +271,8 @@ static void sahara_hello(struct sahara_context *context)
}
if (le32_to_cpu(context->rx->hello.mode) != SAHARA_MODE_IMAGE_TX_PENDING &&
- le32_to_cpu(context->rx->hello.mode) != SAHARA_MODE_IMAGE_TX_COMPLETE) {
+ le32_to_cpu(context->rx->hello.mode) != SAHARA_MODE_IMAGE_TX_COMPLETE &&
+ le32_to_cpu(context->rx->hello.mode) != SAHARA_MODE_MEMORY_DEBUG) {
dev_err(&context->mhi_dev->dev, "Unsupported hello packet - mode %d\n",
le32_to_cpu(context->rx->hello.mode));
return;
@@ -320,9 +406,70 @@ static void sahara_end_of_image(struct sahara_context *context)
dev_dbg(&context->mhi_dev->dev, "Unable to send done response %d\n", ret);
}
+static void sahara_memory_debug64(struct sahara_context *context)
+{
+ int ret;
+
+ dev_dbg(&context->mhi_dev->dev,
+ "MEMORY DEBUG64 cmd received. length:%d table_address:%#llx table_length:%#llx\n",
+ le32_to_cpu(context->rx->length),
+ le64_to_cpu(context->rx->memory_debug64.table_address),
+ le64_to_cpu(context->rx->memory_debug64.table_length));
+
+ if (le32_to_cpu(context->rx->length) != SAHARA_MEM_DEBUG64_LENGTH) {
+ dev_err(&context->mhi_dev->dev, "Malformed memory debug64 packet - length %d\n",
+ le32_to_cpu(context->rx->length));
Any particular reason why you choose not to attempt a reset in these two
error cases?
Undefined behavior. The spec doesn't address this condition - I guess
the assumption is that the device provides valid values.
The spec places limits on reset -
"The host sends a reset packet to reset the target. The target services
a reset request only if it is in a state where reset requests are valid.
If the target receives an invalid reset request, the target sends an
error in an end of image transfer packet"
From there the spec doesn't clarify how to proceed.
So, it seems possible that we send a reset here, the device rejects it,
we get an error back, and cannot really proceed, which feels like a
"back to square 1" situation. Less complex to not send a reset and end
up in the same place.
+ return;
+ }
+
+ context->dump_table_address = le64_to_cpu(context->rx->memory_debug64.table_address);
+ context->dump_table_length = le64_to_cpu(context->rx->memory_debug64.table_length);
+
+ if (context->dump_table_length % sizeof(struct sahara_debug_table_entry64) != 0 ||
+ !context->dump_table_length) {
+ dev_err(&context->mhi_dev->dev, "Malformed memory debug64 packet - table length %lld\n",
+ context->dump_table_length);
+ return;
+ }
+
+ /*
+ * From this point, the protocol flips. We make memory_read requests to
+ * the device, and the device responds with the raw data. If the device
+ * has an error, it will send an End of Image command. First we need to
+ * request the memory dump table so that we know where all the pieces
+ * of the dump are that we can consume.
+ */
+
+ context->is_mem_dump_mode = true;
+
+ /*
+ * Assume that the table is smaller than our MTU so that we can read it
+ * in one shot. The spec does not put an upper limit on the table, but
+ * no known device will exceed this.
+ */
+ if (context->dump_table_length > SAHARA_PACKET_MAX_SIZE) {
+ dev_err(&context->mhi_dev->dev, "Memory dump table length %lld exceeds supported size. Discarding dump\n",
+ context->dump_table_length);
+ sahara_send_reset(context);
+ return;
+ }
+
+ context->tx[0]->cmd = cpu_to_le32(SAHARA_MEM_READ64_CMD);
+ context->tx[0]->length = cpu_to_le32(SAHARA_MEM_READ64_LENGTH);
+ context->tx[0]->memory_read64.memory_address = context->rx->memory_debug64.table_address;
+ context->tx[0]->memory_read64.memory_length = context->rx->memory_debug64.table_length;
I think it would be easier to follow if you passed dump_table_address
and dump_table_length in here, rather than their __le64 source version.
Will do.
+
+ context->rx_size_requested = context->dump_table_length;
+
+ ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0],
+ SAHARA_MEM_READ64_LENGTH, MHI_EOT);
+ if (ret)
+ dev_err(&context->mhi_dev->dev, "Unable to send read for dump table %d\n", ret);
+}
+
static void sahara_processing(struct work_struct *work)
{
- struct sahara_context *context = container_of(work, struct sahara_context, work);
+ struct sahara_context *context = container_of(work, struct sahara_context, fw_work);
int ret;
switch (le32_to_cpu(context->rx->cmd)) {
@@ -338,6 +485,12 @@ static void sahara_processing(struct work_struct *work)
case SAHARA_DONE_RESP_CMD:
/* Intentional do nothing as we don't need to exit an app */
break;
+ case SAHARA_RESET_RESP_CMD:
+ /* Intentional do nothing as we don't need to exit an app */
For this patch I don't have any concern, but should we do something to
track that we should not handle any further requests?
The spec says that the device will (warm) reset after sending this. It
is actually a bit of a race condition as the AIC100 implementation will
send this packet, and then force a watchdog bite to reset. If that
watchdog processing is very quick, we won't even see this.
Another spec ambigiuity on what happens if the device doesn't reset
after this command. Tracking this feels like complexity for little gain.
+ break;
+ case SAHARA_MEM_DEBUG64_CMD:
+ sahara_memory_debug64(context);
+ break;
default:
dev_err(&context->mhi_dev->dev, "Unknown command %d\n",
le32_to_cpu(context->rx->cmd));
@@ -350,6 +503,223 @@ static void sahara_processing(struct work_struct *work)
dev_err(&context->mhi_dev->dev, "Unable to requeue rx buf %d\n", ret);
}
+static void sahara_parse_dump_table(struct sahara_context *context)
+{
+ struct sahara_dump_table_entry *image_out_table_entry;
+ struct sahara_debug_table_entry64 *dev_table_entry;
+ struct sahara_memory_dump_meta_v1 *dump_meta;
+ u64 table_nents;
+ u64 dump_length;
+ int ret;
+ u64 i;
+
+ table_nents = context->dump_table_length / sizeof(*dev_table_entry);
+ context->dump_images_left = table_nents;
+ dump_length = 0;
+
+ for (i = 0; i < table_nents; ++i) {
+ dev_table_entry = (struct sahara_debug_table_entry64 *)(context->rx) + i;
+ /* Do not trust the device, ensure the strings are terminated */
+ dev_table_entry->description[SAHARA_TABLE_ENTRY_STR_LEN - 1] = 0;
+ dev_table_entry->filename[SAHARA_TABLE_ENTRY_STR_LEN - 1] = 0;
+
+ dump_length = size_add(dump_length, le64_to_cpu(dev_table_entry->length));
+ if (dump_length == SIZE_MAX) {
+ /* Discard the dump */
+ sahara_send_reset(context);
+ return;
+ }
+
+ dev_dbg(&context->mhi_dev->dev,
+ "Memory dump table entry %lld type: %lld address: %#llx length: %#llx description: \"%s\" filename \"%s\"\n",
+ i,
+ le64_to_cpu(dev_table_entry->type),
+ le64_to_cpu(dev_table_entry->address),
+ le64_to_cpu(dev_table_entry->length),
+ dev_table_entry->description,
+ dev_table_entry->filename);
+ }
+
+ dump_length = size_add(dump_length, sizeof(*dump_meta));
+ if (dump_length == SIZE_MAX) {
+ /* Discard the dump */
+ sahara_send_reset(context);
+ return;
+ }
+ dump_length = size_add(dump_length, size_mul(sizeof(*image_out_table_entry), table_nents));
+ if (dump_length == SIZE_MAX) {
+ /* Discard the dump */
+ sahara_send_reset(context);
+ return;
+ }
+
+ context->mem_dump_sz = dump_length;
+ context->mem_dump = vzalloc(dump_length);
+ if (!context->mem_dump) {
+ /* Discard the dump */
+ sahara_send_reset(context);
+ return;
+ }
+
+ /* Populate the dump metadata and table for userspace */
+ dump_meta = context->mem_dump;
+ dump_meta->magic = SAHARA_DUMP_V1_MAGIC;
+ dump_meta->version = SAHARA_DUMP_V1_VER;
+ dump_meta->dump_size = dump_length;
+ dump_meta->table_size = context->dump_table_length;
+
+ image_out_table_entry = (struct sahara_dump_table_entry *)(context->mem_dump +
+ sizeof(*dump_meta));
Isn't this cast unnecessary?
I thought there was a compiler warning, but I'm not seeing it. Will fix.
+ dev_table_entry = (struct sahara_debug_table_entry64 *)(context->rx);
+ for (i = 0; i < table_nents; ++i, ++image_out_table_entry, ++dev_table_entry) {
+ image_out_table_entry->type = le64_to_cpu(dev_table_entry->type);
+ image_out_table_entry->address = le64_to_cpu(dev_table_entry->address);
+ image_out_table_entry->length = le64_to_cpu(dev_table_entry->length);
+ strscpy(image_out_table_entry->description, dev_table_entry->description,
+ SAHARA_TABLE_ENTRY_STR_LEN);
+ strscpy(image_out_table_entry->filename,
+ dev_table_entry->filename,
+ SAHARA_TABLE_ENTRY_STR_LEN);
+ }
+
+ context->mem_dump_freespace = image_out_table_entry;
+
+ /* Done parsing the table, switch to image dump mode */
+ context->dump_table_length = 0;
+
+ /* Request the first chunk of the first image */
+ context->dump_image = (struct sahara_dump_table_entry *)(context->mem_dump +
+ sizeof(*dump_meta));
I would have preferred to see this (and above) written such that it's
explicitly clear that you're filling out an array of entries and then
point this to the first entry in that array.
I'm not sure I understand what you would like to see here. Can you
perhaps give an example?
+ dump_length = context->dump_image->length > SAHARA_READ_MAX_SIZE ?
+ SAHARA_READ_MAX_SIZE : context->dump_image->length;
+ /* Avoid requesting EOI sized data so that we can identify errors */
+ if (dump_length == SAHARA_END_OF_IMAGE_LENGTH)
+ dump_length = SAHARA_END_OF_IMAGE_LENGTH / 2;
+
+ context->dump_image_offset = dump_length;
+
+ context->tx[0]->cmd = cpu_to_le32(SAHARA_MEM_READ64_CMD);
+ context->tx[0]->length = cpu_to_le32(SAHARA_MEM_READ64_LENGTH);
+ context->tx[0]->memory_read64.memory_address = cpu_to_le64(context->dump_image->address);
+ context->tx[0]->memory_read64.memory_length = cpu_to_le64(dump_length);
+
+ context->rx_size_requested = dump_length;
+
+ ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0],
+ SAHARA_MEM_READ64_LENGTH, MHI_EOT);
+ if (ret)
+ dev_err(&context->mhi_dev->dev, "Unable to send read for dump content %d\n", ret);
+}
+
+static void sahara_parse_dump_image(struct sahara_context *context)
+{
+ u64 dump_length;
+ int ret;
+
+ memcpy(context->mem_dump_freespace, context->rx, context->rx_size);
+ context->mem_dump_freespace += context->rx_size;
+
+ if (context->dump_image_offset >= context->dump_image->length) {
+ /* Need to move to next image */
+ context->dump_image++;
+ context->dump_images_left--;
+ context->dump_image_offset = 0;
+
+ if (!context->dump_images_left) {
+ /* Dump done */
+ dev_coredumpv(context->mhi_dev->mhi_cntrl->cntrl_dev,
+ context->mem_dump,
+ context->mem_dump_sz,
+ GFP_KERNEL);
+ context->mem_dump = NULL;
+ sahara_send_reset(context);
+ return;
+ }
+ }
+
+ /* Get next image chunk */
+ dump_length = context->dump_image->length - context->dump_image_offset;
+ dump_length = dump_length > SAHARA_READ_MAX_SIZE ? SAHARA_READ_MAX_SIZE : dump_length;
This would be cleaner to write as:
if (dump_length > SAHARA_READ_MAX_SIZE)
dump_length = SAHARA_READ_MAX_SIZE;
Or make the line above min(..., SAHARA_READ_MAX_SIZE);
Doh, min() would be great here. Will fix.
+ /* Avoid requesting EOI sized data so that we can identify errors */
+ if (dump_length == SAHARA_END_OF_IMAGE_LENGTH)
+ dump_length = SAHARA_END_OF_IMAGE_LENGTH / 2;
+
+ context->tx[0]->cmd = cpu_to_le32(SAHARA_MEM_READ64_CMD);
+ context->tx[0]->length = cpu_to_le32(SAHARA_MEM_READ64_LENGTH);
+ context->tx[0]->memory_read64.memory_address =
+ cpu_to_le64(context->dump_image->address + context->dump_image_offset);
+ context->tx[0]->memory_read64.memory_length = cpu_to_le64(dump_length);
+
+ context->dump_image_offset += dump_length;
+ context->rx_size_requested = dump_length;
+
+ ret = mhi_queue_buf(context->mhi_dev, DMA_TO_DEVICE, context->tx[0],
+ SAHARA_MEM_READ64_LENGTH, MHI_EOT);
+ if (ret) {
+ dev_err(&context->mhi_dev->dev,
+ "Unable to send read for dump content %d\n", ret);
+ return;
Unnecessary return statement...
Will fix.
Regards,
Bjorn
+ }
+}
+
+static void sahara_dump_processing(struct work_struct *work)
+{
+ struct sahara_context *context = container_of(work, struct sahara_context, dump_work);
+ int ret;
+
+ /*
+ * We should get the expected raw data, but if the device has an error
+ * it is supposed to send EOI with an error code.
+ */
+ if (context->rx_size != context->rx_size_requested &&
+ context->rx_size != SAHARA_END_OF_IMAGE_LENGTH) {
+ dev_err(&context->mhi_dev->dev,
+ "Unexpected response to read_data. Expected size: %#zx got: %#zx\n",
+ context->rx_size_requested,
+ context->rx_size);
+ goto error;
+ }
+
+ if (context->rx_size == SAHARA_END_OF_IMAGE_LENGTH &&
+ le32_to_cpu(context->rx->cmd) == SAHARA_END_OF_IMAGE_CMD) {
+ dev_err(&context->mhi_dev->dev,
+ "Unexpected EOI response to read_data. Status: %d\n",
+ le32_to_cpu(context->rx->end_of_image.status));
+ goto error;
+ }
+
+ if (context->rx_size == SAHARA_END_OF_IMAGE_LENGTH &&
+ le32_to_cpu(context->rx->cmd) != SAHARA_END_OF_IMAGE_CMD) {
+ dev_err(&context->mhi_dev->dev,
+ "Invalid EOI response to read_data. CMD: %d\n",
+ le32_to_cpu(context->rx->cmd));
+ goto error;
+ }
+
+ /*
+ * Need to know if we received the dump table, or part of a dump image.
+ * Since we get raw data, we cannot tell from the data itself. Instead,
+ * we use the stored dump_table_length, which we zero after we read and
+ * process the entire table.
+ */
+ if (context->dump_table_length)
+ sahara_parse_dump_table(context);
+ else
+ sahara_parse_dump_image(context);
+
+ ret = mhi_queue_buf(context->mhi_dev, DMA_FROM_DEVICE, context->rx,
+ SAHARA_PACKET_MAX_SIZE, MHI_EOT);
+ if (ret)
+ dev_err(&context->mhi_dev->dev, "Unable to requeue rx buf %d\n", ret);
+
+ return;
+
+error:
+ vfree(context->mem_dump);
+ context->mem_dump = NULL;
+ sahara_send_reset(context);
+}
+
static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id)
{
struct sahara_context *context;
@@ -382,7 +752,8 @@ static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_
}
context->mhi_dev = mhi_dev;
- INIT_WORK(&context->work, sahara_processing);
+ INIT_WORK(&context->fw_work, sahara_processing);
+ INIT_WORK(&context->dump_work, sahara_dump_processing);
context->image_table = aic100_image_table;
context->table_size = ARRAY_SIZE(aic100_image_table);
context->active_image_id = SAHARA_IMAGE_ID_NONE;
@@ -405,7 +776,10 @@ static void sahara_mhi_remove(struct mhi_device *mhi_dev)
{
struct sahara_context *context = dev_get_drvdata(&mhi_dev->dev);
- cancel_work_sync(&context->work);
+ cancel_work_sync(&context->fw_work);
+ cancel_work_sync(&context->dump_work);
+ if (context->mem_dump)
+ vfree(context->mem_dump);
sahara_release_image(context);
mhi_unprepare_from_transfer(mhi_dev);
}
@@ -418,8 +792,14 @@ static void sahara_mhi_dl_xfer_cb(struct mhi_device *mhi_dev, struct mhi_result
{
struct sahara_context *context = dev_get_drvdata(&mhi_dev->dev);
- if (!mhi_result->transaction_status)
- schedule_work(&context->work);
+ if (!mhi_result->transaction_status) {
+ context->rx_size = mhi_result->bytes_xferd;
+ if (context->is_mem_dump_mode)
+ schedule_work(&context->dump_work);
+ else
+ schedule_work(&context->fw_work);
+ }
+
}
static const struct mhi_device_id sahara_mhi_match_table[] = {
--
2.34.1