Re: [RESEND PATCH v21 5/6] target-arm: kvm64: handle SIGBUS signal from kernel or KVM

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




On 2019/11/22 23:47, Beata Michalska wrote:
> Hi,
> 
> On Mon, 11 Nov 2019 at 01:48, Xiang Zheng <zhengxiang9@xxxxxxxxxx> wrote:
>>
>> From: Dongjiu Geng <gengdongjiu@xxxxxxxxxx>
>>
>> Add a SIGBUS signal handler. In this handler, it checks the SIGBUS type,
>> translates the host VA delivered by host to guest PA, then fills this PA
>> to guest APEI GHES memory, then notifies guest according to the SIGBUS
>> type.
>>
>> When guest accesses the poisoned memory, it will generate a Synchronous
>> External Abort(SEA). Then host kernel gets an APEI notification and calls
>> memory_failure() to unmapped the affected page in stage 2, finally
>> returns to guest.
>>
>> Guest continues to access the PG_hwpoison page, it will trap to KVM as
>> stage2 fault, then a SIGBUS_MCEERR_AR synchronous signal is delivered to
>> Qemu, Qemu records this error address into guest APEI GHES memory and
>> notifes guest using Synchronous-External-Abort(SEA).
>>
>> In order to inject a vSEA, we introduce the kvm_inject_arm_sea() function
>> in which we can setup the type of exception and the syndrome information.
>> When switching to guest, the target vcpu will jump to the synchronous
>> external abort vector table entry.
>>
>> The ESR_ELx.DFSC is set to synchronous external abort(0x10), and the
>> ESR_ELx.FnV is set to not valid(0x1), which will tell guest that FAR is
>> not valid and hold an UNKNOWN value. These values will be set to KVM
>> register structures through KVM_SET_ONE_REG IOCTL.
>>
>> Signed-off-by: Dongjiu Geng <gengdongjiu@xxxxxxxxxx>
>> Signed-off-by: Xiang Zheng <zhengxiang9@xxxxxxxxxx>
>> Reviewed-by: Michael S. Tsirkin <mst@xxxxxxxxxx>
>> ---
>>  hw/acpi/acpi_ghes.c         | 297 ++++++++++++++++++++++++++++++++++++
>>  include/hw/acpi/acpi_ghes.h |   4 +
>>  include/sysemu/kvm.h        |   3 +-
>>  target/arm/cpu.h            |   4 +
>>  target/arm/helper.c         |   2 +-
>>  target/arm/internals.h      |   5 +-
>>  target/arm/kvm64.c          |  64 ++++++++
>>  target/arm/tlb_helper.c     |   2 +-
>>  target/i386/cpu.h           |   2 +
>>  9 files changed, 377 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/acpi/acpi_ghes.c b/hw/acpi/acpi_ghes.c
>> index 42c00ff3d3..f5b54990c0 100644
>> --- a/hw/acpi/acpi_ghes.c
>> +++ b/hw/acpi/acpi_ghes.c
>> @@ -39,6 +39,34 @@
>>  /* The max size in bytes for one error block */
>>  #define ACPI_GHES_MAX_RAW_DATA_LENGTH       0x1000
>>
>> +/*
>> + * The total size of Generic Error Data Entry
>> + * ACPI 6.1/6.2: 18.3.2.7.1 Generic Error Data,
>> + * Table 18-343 Generic Error Data Entry
>> + */
>> +#define ACPI_GHES_DATA_LENGTH               72
>> +
>> +/*
>> + * The memory section CPER size,
>> + * UEFI 2.6: N.2.5 Memory Error Section
>> + */
>> +#define ACPI_GHES_MEM_CPER_LENGTH           80
>> +
>> +/*
>> + * Masks for block_status flags
>> + */
>> +#define ACPI_GEBS_UNCORRECTABLE         1
> 
> Why not listing all supported statuses ? Similar to error severity below ?
> 
>> +
>> +/*
>> + * Values for error_severity field
>> + */
>> +enum AcpiGenericErrorSeverity {
>> +    ACPI_CPER_SEV_RECOVERABLE,
>> +    ACPI_CPER_SEV_FATAL,
>> +    ACPI_CPER_SEV_CORRECTED,
>> +    ACPI_CPER_SEV_NONE,
>> +};
>> +
>>  /*
>>   * Now only support ARMv8 SEA notification type error source
>>   */
>> @@ -49,6 +77,16 @@
>>   */
>>  #define ACPI_GHES_SOURCE_GENERIC_ERROR_V2   10
>>
>> +#define UUID_BE(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7)        \
>> +    {{{ ((a) >> 24) & 0xff, ((a) >> 16) & 0xff, ((a) >> 8) & 0xff, (a) & 0xff, \
>> +    ((b) >> 8) & 0xff, (b) & 0xff,                   \
>> +    ((c) >> 8) & 0xff, (c) & 0xff,                    \
>> +    (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) } } }
>> +
>> +#define UEFI_CPER_SEC_PLATFORM_MEM                   \
>> +    UUID_BE(0xA5BC1114, 0x6F64, 0x4EDE, 0xB8, 0x63, 0x3E, 0x83, \
>> +    0xED, 0x7C, 0x83, 0xB1)
>> +
>>  /*
>>   * | +--------------------------+ 0
>>   * | |        Header            |
>> @@ -77,6 +115,174 @@ typedef struct AcpiGhesState {
>>      uint64_t ghes_addr_le;
>>  } AcpiGhesState;
>>
>> +/*
>> + * Total size for Generic Error Status Block
>> + * ACPI 6.2: 18.3.2.7.1 Generic Error Data,
>> + * Table 18-380 Generic Error Status Block
>> + */
>> +#define ACPI_GHES_GESB_SIZE                 20
> 
> Minor: This is not entirely correct: GEDE is part of GESB so the total length
> would be ACPI_GHES_GESB_SIZE + n* sizeof(GEDE)
yes, the comments needs to correct.

> 
>> +/* The offset of Data Length in Generic Error Status Block */
>> +#define ACPI_GHES_GESB_DATA_LENGTH_OFFSET   12
>> +
> 
> If those were nicely represented as structures you get the offsets easily
> without having number of defines. That could simplify the code and make it
> more readable - see comments below
> 
>> +/*
>> + * Record the value of data length for each error status block to avoid getting
>> + * this value from guest.
>> + */
>> +static uint32_t acpi_ghes_data_length[ACPI_GHES_ERROR_SOURCE_COUNT];
>> +
>> +/*
>> + * Generic Error Data Entry
>> + * ACPI 6.1: 18.3.2.7.1 Generic Error Data
>> + */
>> +static void acpi_ghes_generic_error_data(GArray *table, QemuUUID section_type,
>> +                uint32_t error_severity, uint16_t revision,
>> +                uint8_t validation_bits, uint8_t flags,
>> +                uint32_t error_data_length, QemuUUID fru_id,
>> +                uint8_t *fru_text, uint64_t time_stamp)
> 
> Why not just defining a struct that represents the GED entry?

This is due to address Igor's comments. there are two reasons:
1. avoid define many structures about APEI/GHES/CPER, so you can see it has very little structures definition in acpi_ghes.h
2. using build_append_int_noprefix() to compose the table can avoid considering endian

> 
>> +{
>> +    QemuUUID uuid_le;
>> +
>> +    /* Section Type */
>> +    uuid_le = qemu_uuid_bswap(section_type);
>> +    g_array_append_vals(table, uuid_le.data, ARRAY_SIZE(uuid_le.data));
>> +
>> +    /* Error Severity */
>> +    build_append_int_noprefix(table, error_severity, 4);
>> +    /* Revision */
>> +    build_append_int_noprefix(table, revision, 2);
> 
> Minor: According to the spec it seems that the revision number is
> a fixed value so you could drop that from the parameters....
> or ... use a struct to represent the data
> 
>> +    /* Validation Bits */
>> +    build_append_int_noprefix(table, validation_bits, 1);
>> +    /* Flags */
>> +    build_append_int_noprefix(table, flags, 1);
>> +    /* Error Data Length */
>> +    build_append_int_noprefix(table, error_data_length, 4);
>> +
>> +    /* FRU Id */
>> +    uuid_le = qemu_uuid_bswap(fru_id);
>> +    g_array_append_vals(table, uuid_le.data, ARRAY_SIZE(uuid_le.data));
>> +
>> +    /* FRU Text */
>> +    g_array_append_vals(table, fru_text, 20);
>> +    /* Timestamp */
>> +    build_append_int_noprefix(table, time_stamp, 8);
>> +}
>> +
>> +/*
>> + * Generic Error Status Block
>> + * ACPI 6.1: 18.3.2.7.1 Generic Error Data
>> + */
>> +static void acpi_ghes_generic_error_status(GArray *table, uint32_t block_status,
>> +                uint32_t raw_data_offset, uint32_t raw_data_length,
>> +                uint32_t data_length, uint32_t error_severity)
> 
> Same as the above
> 
>> +{
>> +    /* Block Status */
>> +    build_append_int_noprefix(table, block_status, 4);
>> +    /* Raw Data Offset */
>> +    build_append_int_noprefix(table, raw_data_offset, 4);
>> +    /* Raw Data Length */
>> +    build_append_int_noprefix(table, raw_data_length, 4);
>> +    /* Data Length */
>> +    build_append_int_noprefix(table, data_length, 4);
>> +    /* Error Severity */
>> +    build_append_int_noprefix(table, error_severity, 4);
>> +}
>> +
>> +/* UEFI 2.6: N.2.5 Memory Error Section */
>> +static void acpi_ghes_build_append_mem_cper(GArray *table,
>> +                                            uint64_t error_physical_addr)
>> +{
>> +    /*
>> +     * Memory Error Record
>> +     */
>> +
>> +    /* Validation Bits */
>> +    build_append_int_noprefix(table,
>> +                              (1UL << 14) | /* Type Valid */
>> +                              (1UL << 1) /* Physical Address Valid */,
>> +                              8);
>> +    /* Error Status */
>> +    build_append_int_noprefix(table, 0, 8);
> 
> Just wondering whether it would be worth to specify the Error Type
> through the Error Status ?
> 
>> +    /* Physical Address */
>> +    build_append_int_noprefix(table, error_physical_addr, 8);
>> +    /* Skip all the detailed information normally found in such a record */
>> +    build_append_int_noprefix(table, 0, 48);
>> +    /* Memory Error Type */
>> +    build_append_int_noprefix(table, 0 /* Unknown error */, 1);
>> +    /* Skip all the detailed information normally found in such a record */
>> +    build_append_int_noprefix(table, 0, 7);
>> +}
>> +
>> +static int acpi_ghes_record_mem_error(uint64_t error_block_address,
>> +                                      uint64_t error_physical_addr,
>> +                                      uint32_t data_length)
>> +{
>> +    GArray *block;
>> +    uint64_t current_block_length;
>> +    /* Memory Error Section Type */
>> +    QemuUUID mem_section_id_le = UEFI_CPER_SEC_PLATFORM_MEM;
> 
> As already mentioned - mixing LE /w BE
> 
>> +    QemuUUID fru_id = {};
>> +    uint8_t fru_text[20] = {};
>> +
>> +    /*
>> +     * Generic Error Status Block
>> +     * | +---------------------+
>> +     * | |     block_status    |
>> +     * | +---------------------+
>> +     * | |    raw_data_offset  |
>> +     * | +---------------------+
>> +     * | |    raw_data_length  |
>> +     * | +---------------------+
>> +     * | |     data_length     |
>> +     * | +---------------------+
>> +     * | |   error_severity    |
>> +     * | +---------------------+
>> +     */
>> +    block = g_array_new(false, true /* clear */, 1);
>> +
>> +    /* The current whole length of the generic error status block */
>> +    current_block_length = ACPI_GHES_GESB_SIZE + data_length;
>> +
>> +    /* This is the length if adding a new generic error data entry*/
>> +    data_length += ACPI_GHES_DATA_LENGTH;
>> +    data_length += ACPI_GHES_MEM_CPER_LENGTH;
>> +
>> +    /*
>> +     * Check whether it will run out of the preallocated memory if adding a new
>> +     * generic error data entry
>> +     */
>> +    if ((data_length + ACPI_GHES_GESB_SIZE) > ACPI_GHES_MAX_RAW_DATA_LENGTH) {
>> +        error_report("Record CPER out of boundary!!!");
> 
> Minor: The error message could be made more accurate, like:
>     "Not enough memory to record new CPER"
> 
>> +        return ACPI_GHES_CPER_FAIL;
>> +    }
>> +
>> +    /* Build the new generic error status block header */
>> +    acpi_ghes_generic_error_status(block, cpu_to_le32(ACPI_GEBS_UNCORRECTABLE),
>> +        0, 0, cpu_to_le32(data_length), cpu_to_le32(ACPI_CPER_SEV_RECOVERABLE));
>> +
>> +    /* Write back above generic error status block header to guest memory */
>> +    cpu_physical_memory_write(error_block_address, block->data,
>> +                              block->len);
>> +
>> +    /* Add a new generic error data entry */
>> +
>> +    data_length = block->len;
>> +    /* Build this new generic error data entry header */
>> +    acpi_ghes_generic_error_data(block, mem_section_id_le,
>> +        cpu_to_le32(ACPI_CPER_SEV_RECOVERABLE), cpu_to_le32(0x300), 0, 0,
>> +        cpu_to_le32(ACPI_GHES_MEM_CPER_LENGTH), fru_id, fru_text, 0);
>> +
>> +    /* Build the memory section CPER for above new generic error data entry */
>> +    acpi_ghes_build_append_mem_cper(block, error_physical_addr);
>> +
>> +    /* Write back above this new generic error data entry to guest memory */
>> +    cpu_physical_memory_write(error_block_address + current_block_length,
>> +        block->data + data_length, block->len - data_length);
>> +
> 
> As already mentioned and unless I have missed smth (which is highly possible)
> this will append new records while the GESB is kept 'in-place'. So the
> used space is
> only growing.
> 
>> +    g_array_free(block, true);
>> +
>> +    return ACPI_GHES_CPER_OK;
>> +}
>> +
>>  /*
>>   * Hardware Error Notification
>>   * ACPI 4.0: 17.3.2.7 Hardware Error Notification
>> @@ -265,3 +471,94 @@ void acpi_ghes_add_fw_cfg(FWCfgState *s, GArray *hardware_error)
>>      fw_cfg_add_file_callback(s, ACPI_GHES_DATA_ADDR_FW_CFG_FILE, NULL, NULL,
>>          NULL, &ges.ghes_addr_le, sizeof(ges.ghes_addr_le), false);
>>  }
>> +
>> +bool acpi_ghes_record_errors(uint32_t notify, uint64_t physical_address)
>> +{
>> +    uint64_t error_block_addr, read_ack_register_addr, read_ack_register = 0;
>> +    int loop = 0;
>> +    uint64_t start_addr = le64_to_cpu(ges.ghes_addr_le);
>> +    bool ret = ACPI_GHES_CPER_FAIL;
>> +    uint8_t source_id;
>> +    const uint8_t error_source_id[] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
>> +                                        0xff, 0xff,    0, 0xff, 0xff, 0xff};
>> +
> 
> I'm not entirely sure why this is needed - se below
> 
>> +    /*
>> +     * | +---------------------+ ges.ghes_addr_le
>> +     * | |error_block_address0 |
>> +     * | +---------------------+ --+--
>> +     * | |    .............    | ACPI_GHES_ADDRESS_SIZE
>> +     * | +---------------------+ --+--
>> +     * | |error_block_addressN |
>> +     * | +---------------------+
>> +     * | | read_ack_register0  |
>> +     * | +---------------------+ --+--
>> +     * | |   .............     | ACPI_GHES_ADDRESS_SIZE
>> +     * | +---------------------+ --+--
>> +     * | | read_ack_registerN  |
>> +     * | +---------------------+ --+--
>> +     * | |      CPER           |   |
>> +     * | |      ....           | ACPI_GHES_MAX_RAW_DATA_LENGT
>> +     * | |      CPER           |   |
>> +     * | +---------------------+ --+--
>> +     * | |    ..........       |
>> +     * | +---------------------+
>> +     * | |      CPER           |
>> +     * | |      ....           |
>> +     * | |      CPER           |
>> +     * | +---------------------+
>> +     */
>> +    if (physical_address && notify < ACPI_GHES_NOTIFY_RESERVED) {
>> +        /* Find and check the source id for this new CPER */
>> +        source_id = error_source_id[notify];
> 
> Why not using switch case for supported source types ?
> For the time being only one is being supported. And you only use that to
> verify that support - seems a bit unnecessary.

Afterwards May be we will many source types to support, so Igor's suggestion is better as shown below.

static const uint8_t ghes_notify2source_id_map[] = {
    ACPI_HEST_SRC_ID_SEA,
    ACPI_HEST_SRC_ID_RESERVED
}


> 
>> +        if (source_id != 0xff) {
>> +            start_addr += source_id * ACPI_GHES_ADDRESS_SIZE;
>> +        } else {
>> +            goto out;
>> +        }
>> +
[...]
>>
>> +/* Callers must hold the iothread mutex lock */
>> +static void kvm_inject_arm_sea(CPUState *c)
> 
> We could enclose this function along with the kvm_arch_on_sigbus_vcpu
> within ifdef switch for KVM_HAVE_MCE_INJECTION
> 
>> +{
>> +    ARMCPU *cpu = ARM_CPU(c);
>> +    CPUARMState *env = &cpu->env;
>> +    CPUClass *cc = CPU_GET_CLASS(c);
>> +    uint32_t esr;
>> +    bool same_el;
>> +
>> +    c->exception_index = EXCP_DATA_ABORT;
>> +    env->exception.target_el = 1;
>> +
>> +    /*
>> +     * Set the DFSC to synchronous external abort and set FnV to not valid,
>> +     * this will tell guest the FAR_ELx is UNKNOWN for this abort.
>> +     */
>> +    same_el = arm_current_el(env) == env->exception.target_el;
>> +    esr = syn_data_abort_no_iss(same_el, 1, 0, 0, 0, 0, 0x10);
> 
> IINM this is the only use case when FnV is considered to be valid
> so I'm not convinced it is worth to modify the syn_data_abort_no_iss
> just for this.

Here we set the FnV to not valid, not to set it to valid.
because Guest will use the physical address that recorded in APEI table.

> 
>> +
>> +    env->exception.syndrome = esr;
>> +
>> +    cc->do_interrupt(c);
>> +}
>> +
>>  #define AARCH64_CORE_REG(x)   (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \
>>                   KVM_REG_ARM_CORE | KVM_REG_ARM_CORE_REG(x))
>>
>> @@ -1036,6 +1062,44 @@ int kvm_arch_get_registers(CPUState *cs)
>>      return ret;
>>  }
>>
>> +void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr)
>> +{
>> +    ram_addr_t ram_addr;
>> +    hwaddr paddr;
>> +
>> +    assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
>> +
>> +    if (acpi_enabled && addr &&
>> +            object_property_get_bool(qdev_get_machine(), "ras", NULL)) {
>> +        ram_addr = qemu_ram_addr_from_host(addr);
>> +        if (ram_addr != RAM_ADDR_INVALID &&
>> +            kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) {
>> +            kvm_hwpoison_page_add(ram_addr);
>> +            /*
>> +             * Asynchronous signal will be masked by main thread, so
>> +             * only handle synchronous signal.
>> +             */
> 
> I'm not entirely sure that the comment above is correct (it has been
> pointed out before). I would expect the AO signal to be handled here as
> well. Not having proper support to do that just yet is another story but
> the comment might be bit misleading.
> 
> 
>> +            if (code == BUS_MCEERR_AR) {
>> +                kvm_cpu_synchronize_state(c);
>> +                if (ACPI_GHES_CPER_FAIL !=
>> +                    acpi_ghes_record_errors(ACPI_GHES_NOTIFY_SEA, paddr)) {
>> +                    kvm_inject_arm_sea(c);
>> +                } else {
>> +                    fprintf(stderr, "failed to record the error\n");
>> +                }
>> +            }
>> +            return;
>> +        }
>> +        fprintf(stderr, "Hardware memory error for memory used by "
>> +                "QEMU itself instead of guest system!\n");
>> +    }
>> +
>> +    if (code == BUS_MCEERR_AR) {
>> +        fprintf(stderr, "Hardware memory error!\n");
>> +        exit(1);
>> +    }
>> +}
>> +
>>  /* C6.6.29 BRK instruction */
>>  static const uint32_t brk_insn = 0xd4200000;
>>
>> diff --git a/target/arm/tlb_helper.c b/target/arm/tlb_helper.c
>> index 5feb312941..499672ebbc 100644
>> --- a/target/arm/tlb_helper.c
>> +++ b/target/arm/tlb_helper.c
>> @@ -33,7 +33,7 @@ static inline uint32_t merge_syn_data_abort(uint32_t template_syn,
>>       * ISV field.
>>       */
>>      if (!(template_syn & ARM_EL_ISV) || target_el != 2 || s1ptw) {
>> -        syn = syn_data_abort_no_iss(same_el,
>> +        syn = syn_data_abort_no_iss(same_el, 0,
>>                                      ea, 0, s1ptw, is_write, fsc);
>>      } else {
>>          /*
>> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
>> index 5352c9ff55..f75a210f96 100644
>> --- a/target/i386/cpu.h
>> +++ b/target/i386/cpu.h
>> @@ -29,6 +29,8 @@
>>  /* The x86 has a strong memory model with some store-after-load re-ordering */
>>  #define TCG_GUEST_DEFAULT_MO      (TCG_MO_ALL & ~TCG_MO_ST_LD)
>>
>> +#define KVM_HAVE_MCE_INJECTION 1
>> +
>>  /* Maximum instruction code size */
>>  #define TARGET_MAX_INSN_SIZE 16
>>
>> --
>> 2.19.1
>>
>>
>>
> .
> 




[Index of Archives]     [KVM ARM]     [KVM ia64]     [KVM ppc]     [Virtualization Tools]     [Spice Development]     [Libvirt]     [Libvirt Users]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite Questions]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux