On Mon, 02 Jul 2018 11:16:29 +0530 Mahesh J Salgaonkar <mahesh@xxxxxxxxxxxxxxxxxx> wrote: > From: Mahesh Salgaonkar <mahesh@xxxxxxxxxxxxxxxxxx> > > rtas_log_buf is a buffer to hold RTAS event data that are communicated > to kernel by hypervisor. This buffer is then used to pass RTAS event > data to user through proc fs. This buffer is allocated from vmalloc > (non-linear mapping) area. > > On Machine check interrupt, register r3 points to RTAS extended event > log passed by hypervisor that contains the MCE event. The pseries > machine check handler then logs this error into rtas_log_buf. The > rtas_log_buf is a vmalloc-ed (non-linear) buffer we end up taking up a > page fault (vector 0x300) while accessing it. Since machine check > interrupt handler runs in NMI context we can not afford to take any > page fault. Page faults are not honored in NMI context and causes > kernel panic. Apart from that, as Nick pointed out, pSeries_log_error() > also takes a spin_lock while logging error which is not safe in NMI > context. It may endup in deadlock if we get another MCE before releasing > the lock. Fix this by deferring the logging of rtas error to irq work queue. > > Current implementation uses two different buffers to hold rtas error log > depending on whether extended log is provided or not. This makes bit > difficult to identify which buffer has valid data that needs to logged > later in irq work. Simplify this using single buffer, one per paca, and > copy rtas log to it irrespective of whether extended log is provided or > not. Allocate this buffer below RMA region so that it can be accessed > in real mode mce handler. > > Fixes: b96672dd840f ("powerpc: Machine check interrupt is a non-maskable interrupt") > Cc: stable@xxxxxxxxxxxxxxx > Signed-off-by: Mahesh Salgaonkar <mahesh@xxxxxxxxxxxxxxxxxx> I think this looks reasonable. It doesn't fix that commit so much as fixes the problem that's apparent after it's applied. I don't know if we should backport this to a wider set of stable kernels? Aside from that, Reviewed-by: Nicholas Piggin <npiggin@xxxxxxxxx> Thanks, Nick > --- > arch/powerpc/include/asm/paca.h | 3 ++ > arch/powerpc/platforms/pseries/ras.c | 47 ++++++++++++++++++++++---------- > arch/powerpc/platforms/pseries/setup.c | 16 +++++++++++ > 3 files changed, 51 insertions(+), 15 deletions(-) > > diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h > index 3f109a3e3edb..b441fef53077 100644 > --- a/arch/powerpc/include/asm/paca.h > +++ b/arch/powerpc/include/asm/paca.h > @@ -251,6 +251,9 @@ struct paca_struct { > void *rfi_flush_fallback_area; > u64 l1d_flush_size; > #endif > +#ifdef CONFIG_PPC_PSERIES > + u8 *mce_data_buf; /* buffer to hold per cpu rtas errlog */ > +#endif /* CONFIG_PPC_PSERIES */ > } ____cacheline_aligned; > > extern void copy_mm_to_paca(struct mm_struct *mm); > diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c > index ef104144d4bc..14a46b07ab2f 100644 > --- a/arch/powerpc/platforms/pseries/ras.c > +++ b/arch/powerpc/platforms/pseries/ras.c > @@ -22,6 +22,7 @@ > #include <linux/of.h> > #include <linux/fs.h> > #include <linux/reboot.h> > +#include <linux/irq_work.h> > > #include <asm/machdep.h> > #include <asm/rtas.h> > @@ -32,11 +33,13 @@ > static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; > static DEFINE_SPINLOCK(ras_log_buf_lock); > > -static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; > -static DEFINE_PER_CPU(__u64, mce_data_buf); > - > static int ras_check_exception_token; > > +static void mce_process_errlog_event(struct irq_work *work); > +static struct irq_work mce_errlog_process_work = { > + .func = mce_process_errlog_event, > +}; > + > #define EPOW_SENSOR_TOKEN 9 > #define EPOW_SENSOR_INDEX 0 > > @@ -330,16 +333,20 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) > ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ > (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) > > +static inline struct rtas_error_log *fwnmi_get_errlog(void) > +{ > + return (struct rtas_error_log *)local_paca->mce_data_buf; > +} > + > /* > * Get the error information for errors coming through the > * FWNMI vectors. The pt_regs' r3 will be updated to reflect > * the actual r3 if possible, and a ptr to the error log entry > * will be returned if found. > * > - * If the RTAS error is not of the extended type, then we put it in a per > - * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf. > + * Use one buffer mce_data_buf per cpu to store RTAS error. > * > - * The global_mce_data_buf does not have any locks or protection around it, > + * The mce_data_buf does not have any locks or protection around it, > * if a second machine check comes in, or a system reset is done > * before we have logged the error, then we will get corruption in the > * error log. This is preferable over holding off on calling > @@ -349,7 +356,7 @@ static irqreturn_t ras_error_interrupt(int irq, void *dev_id) > static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) > { > unsigned long *savep; > - struct rtas_error_log *h, *errhdr = NULL; > + struct rtas_error_log *h; > > /* Mask top two bits */ > regs->gpr[3] &= ~(0x3UL << 62); > @@ -362,22 +369,20 @@ static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) > savep = __va(regs->gpr[3]); > regs->gpr[3] = savep[0]; /* restore original r3 */ > > - /* If it isn't an extended log we can use the per cpu 64bit buffer */ > h = (struct rtas_error_log *)&savep[1]; > + /* Use the per cpu buffer from paca to store rtas error log */ > + memset(local_paca->mce_data_buf, 0, RTAS_ERROR_LOG_MAX); > if (!rtas_error_extended(h)) { > - memcpy(this_cpu_ptr(&mce_data_buf), h, sizeof(__u64)); > - errhdr = (struct rtas_error_log *)this_cpu_ptr(&mce_data_buf); > + memcpy(local_paca->mce_data_buf, h, sizeof(__u64)); > } else { > int len, error_log_length; > > error_log_length = 8 + rtas_error_extended_log_length(h); > len = min_t(int, error_log_length, RTAS_ERROR_LOG_MAX); > - memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); > - memcpy(global_mce_data_buf, h, len); > - errhdr = (struct rtas_error_log *)global_mce_data_buf; > + memcpy(local_paca->mce_data_buf, h, len); > } > > - return errhdr; > + return (struct rtas_error_log *)local_paca->mce_data_buf; > } > > /* Call this when done with the data returned by FWNMI_get_errinfo. > @@ -422,6 +427,17 @@ int pSeries_system_reset_exception(struct pt_regs *regs) > return 0; /* need to perform reset */ > } > > +/* > + * Process MCE rtas errlog event. > + */ > +static void mce_process_errlog_event(struct irq_work *work) > +{ > + struct rtas_error_log *err; > + > + err = fwnmi_get_errlog(); > + log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); > +} > + > /* > * See if we can recover from a machine check exception. > * This is only called on power4 (or above) and only via > @@ -466,7 +482,8 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) > recovered = 1; > } > > - log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); > + /* Queue irq work to log this rtas event later. */ > + irq_work_queue(&mce_errlog_process_work); > > return recovered; > } > diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c > index fdb32e056ef4..60a067a6e743 100644 > --- a/arch/powerpc/platforms/pseries/setup.c > +++ b/arch/powerpc/platforms/pseries/setup.c > @@ -41,6 +41,7 @@ > #include <linux/root_dev.h> > #include <linux/of.h> > #include <linux/of_pci.h> > +#include <linux/memblock.h> > > #include <asm/mmu.h> > #include <asm/processor.h> > @@ -101,6 +102,9 @@ static void pSeries_show_cpuinfo(struct seq_file *m) > static void __init fwnmi_init(void) > { > unsigned long system_reset_addr, machine_check_addr; > + u8 *mce_data_buf; > + unsigned int i; > + int nr_cpus = num_possible_cpus(); > > int ibm_nmi_register = rtas_token("ibm,nmi-register"); > if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE) > @@ -114,6 +118,18 @@ static void __init fwnmi_init(void) > if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr, > machine_check_addr)) > fwnmi_active = 1; > + > + /* > + * Allocate a chunk for per cpu buffer to hold rtas errorlog. > + * It will be used in real mode mce handler, hence it needs to be > + * below RMA. > + */ > + mce_data_buf = __va(memblock_alloc_base(RTAS_ERROR_LOG_MAX * nr_cpus, > + RTAS_ERROR_LOG_MAX, ppc64_rma_size)); > + for_each_possible_cpu(i) { > + paca_ptrs[i]->mce_data_buf = mce_data_buf + > + (RTAS_ERROR_LOG_MAX * i); > + } > } > > static void pseries_8259_cascade(struct irq_desc *desc) >