Dear RT folks!

I'm pleased to announce the v4.4.7-rt16 patch set.

Changes since v4.4.7-rt15:

  - Picked up a few fixes from upstream for panic() re-entrance from
    NMI. On -RT we have the same problem without NMI, but with the
    soft/hard watchdog triggering panic() (see the panic_cpu sketch
    after this list).

  - Don't take the port->lock if oops_in_progress is set. We had a
    trylock, but that trylock does not work if invoked with IRQs off
    (like from the panic() caller). I am not very happy about this,
    but if we keep it that way it would make sense to make a similar
    change to the other UART drivers…

  - Rik van Riel and Clark Williams pointed out that a change made by
    Frederic Weisbecker in v4.5 could be backported, which lets us
    remove some locking around the vtime handling (see the seqcount
    sketch after this list).
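The serialization trick behind the nmi_panic() changes in the patch
below is a "first CPU wins" cmpxchg on an atomic CPU id (panic_cpu)
instead of the old panic_lock: a recursive panic() on the owning CPU
falls through instead of deadlocking, while other CPUs stop themselves.
Here is a minimal userspace model of that idea — illustrative only, not
the kernel code; C11 atomics stand in for the kernel's atomic_cmpxchg():

/*
 * Minimal userspace model of the panic_cpu handshake; illustrative
 * only.  atomic_int stands in for atomic_t, and
 * atomic_compare_exchange_strong() for atomic_cmpxchg().
 */
#include <stdatomic.h>
#include <stdio.h>

#define PANIC_CPU_INVALID	-1

static atomic_int panic_cpu = PANIC_CPU_INVALID;

/* Returns 1 if this CPU owns the panic, 0 if it must stop itself. */
static int panic_enter(int this_cpu)
{
	int old_cpu = PANIC_CPU_INVALID;

	/*
	 * Only the first caller swaps INVALID -> this_cpu.  A recursive
	 * panic on the owning CPU sees old_cpu == this_cpu and falls
	 * through instead of deadlocking on a lock it already holds.
	 */
	if (atomic_compare_exchange_strong(&panic_cpu, &old_cpu, this_cpu))
		return 1;
	return old_cpu == this_cpu;
}

int main(void)
{
	printf("cpu0, first panic: %d\n", panic_enter(0));	/* 1 */
	printf("cpu0, recursion:   %d\n", panic_enter(0));	/* 1 */
	printf("cpu1, racing:      %d\n", panic_enter(1));	/* 0 */
	return 0;
}

In the real patch the owning CPU goes on to execute panic() while the
losing CPUs park in panic_smp_self_stop() / nmi_panic_self_stop().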
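The backported vtime change drops tsk->vtime_lock and keeps only a
seqcount (vtime_seqcount). That is sufficient because the writer is
always the task itself; readers such as task_gtime() simply retry. A
rough userspace sketch of the protocol — illustrative only, not the
kernel's seqcount_t, and kept simple with sequentially consistent
atomics:

/*
 * Rough userspace model of a seqcount: the writer bumps the counter to
 * odd before updating and to even afterwards; readers retry whenever
 * they observe an odd or changed counter.
 */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint seq;			/* even: stable, odd: write in flight */
static unsigned long long vtime_snap;	/* the protected data */

static void writer_update(unsigned long long now)
{
	atomic_fetch_add(&seq, 1);	/* begin: count becomes odd */
	vtime_snap = now;
	atomic_fetch_add(&seq, 1);	/* end: count becomes even */
}

static unsigned long long reader_snap(void)
{
	unsigned int begin;
	unsigned long long val;

	do {
		/* Wait until no write is in flight. */
		while ((begin = atomic_load(&seq)) & 1)
			;
		val = vtime_snap;
	} while (atomic_load(&seq) != begin);	/* retry if it moved */

	return val;
}

int main(void)
{
	writer_update(42);
	printf("snapshot: %llu\n", reader_snap());
	return 0;
}

This is why the write side in the patch only needs
write_seqcount_begin()/write_seqcount_end(), and why vtime_init_idle()
can get away with plain local_irq_save().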
Known issues:

  - CPU hotplug got a little better but can still deadlock.

The delta patch against 4.4.7-rt15 is appended below and can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.4/incr/patch-4.4.7-rt15-rt16.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v4.4.7-rt16

The RT patch against 4.4.7 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.4/patch-4.4.7-rt16.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.4/patches-4.4.7-rt16.tar.xz

Sebastian

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 697f90db0e37..424aec4a4c71 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -231,7 +231,7 @@ pci_serr_error(unsigned char reason, struct pt_regs *regs)
 #endif
 
 	if (panic_on_unrecovered_nmi)
-		panic("NMI: Not continuing");
+		nmi_panic(regs, "NMI: Not continuing");
 
 	pr_emerg("Dazed and confused, but trying to continue\n");
 
@@ -255,8 +255,16 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 		 reason, smp_processor_id());
 	show_regs(regs);
 
-	if (panic_on_io_nmi)
-		panic("NMI IOCK error: Not continuing");
+	if (panic_on_io_nmi) {
+		nmi_panic(regs, "NMI IOCK error: Not continuing");
+
+		/*
+		 * If we end up here, it means we have received an NMI while
+		 * processing panic(). Simply return without delaying and
+		 * re-enabling NMIs.
+		 */
+		return;
+	}
 
 	/* Re-enable the IOCK line, wait for a few seconds */
 	reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
@@ -297,7 +305,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 
 	pr_emerg("Do you have a strange power saving mode enabled?\n");
 	if (unknown_nmi_panic || panic_on_unrecovered_nmi)
-		panic("NMI: Not continuing");
+		nmi_panic(regs, "NMI: Not continuing");
 
 	pr_emerg("Dazed and confused, but trying to continue\n");
 }
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f660d63f40fe..8384207adde2 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -726,6 +726,7 @@ static int crashing_cpu;
 static nmi_shootdown_cb shootdown_callback;
 
 static atomic_t waiting_for_crash_ipi;
+static int crash_ipi_issued;
 
 static int crash_nmi_callback(unsigned int val, struct pt_regs *regs)
 {
@@ -788,6 +789,9 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
 
 	smp_send_nmi_allbutself();
 
+	/* Kick CPUs looping in NMI context. */
+	WRITE_ONCE(crash_ipi_issued, 1);
+
 	msecs = 1000; /* Wait at most a second for the other cpus to stop */
 	while ((atomic_read(&waiting_for_crash_ipi) > 0) && msecs) {
 		mdelay(1);
@@ -796,6 +800,22 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback)
 
 	/* Leave the nmi callback set */
 }
+
+/* Override the weak function in kernel/panic.c */
+void nmi_panic_self_stop(struct pt_regs *regs)
+{
+	while (1) {
+		/*
+		 * Wait for the crash dumping IPI to be issued, and then
+		 * call its callback directly.
+		 */
+		if (READ_ONCE(crash_ipi_issued))
+			crash_nmi_callback(0, regs); /* Don't return */
+
+		cpu_relax();
+	}
+}
+
 #else /* !CONFIG_SMP */
 void nmi_shootdown_cpus(nmi_shootdown_cb callback)
 {
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c
index 91b831a1cc1c..a0b9e854672c 100644
--- a/drivers/tty/serial/8250/8250_port.c
+++ b/drivers/tty/serial/8250/8250_port.c
@@ -2844,9 +2844,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s,
 
 	serial8250_rpm_get(up);
 
-	if (port->sysrq)
+	if (port->sysrq || oops_in_progress)
 		locked = 0;
-	else if (oops_in_progress || in_kdb_printk())
+	else if (in_kdb_printk())
 		locked = spin_trylock_irqsave(&port->lock, flags);
 	else
 		spin_lock_irqsave(&port->lock, flags);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d8ec0f202eee..60fadde71a44 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -156,8 +156,7 @@ extern struct task_group root_task_group;
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 # define INIT_VTIME(tsk)						\
-	.vtime_lock = __RAW_SPIN_LOCK_UNLOCKED(tsk.vtime_lock),	\
-	.vtime_seq = SEQCNT_ZERO(tsk.vtime_seq),			\
+	.vtime_seqcount = SEQCNT_ZERO(tsk.vtime_seqcount),	\
 	.vtime_snap = 0,				\
 	.vtime_snap_whence = VTIME_SYS,
 #else
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index c44e33a49a08..c84b10d6527d 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -259,6 +259,7 @@ extern long (*panic_blink)(int state);
 __printf(1, 2)
 void panic(const char *fmt, ...)
 	__noreturn __cold;
+void nmi_panic(struct pt_regs *regs, const char *msg);
 extern void oops_enter(void);
 extern void oops_exit(void);
 void print_oops_end_marker(void);
@@ -450,6 +451,14 @@ extern int sysctl_panic_on_stackoverflow;
 extern bool crash_kexec_post_notifiers;
 
 /*
+ * panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
+ * holds a CPU number which is executing panic() currently. A value of
+ * PANIC_CPU_INVALID means no CPU has entered panic() or crash_kexec().
+ */
+extern atomic_t panic_cpu;
+#define PANIC_CPU_INVALID	-1
+
+/*
  * Only to be used by arch init code. If the user over-wrote the default
  * CONFIG_PANIC_TIMEOUT, honor it.
  */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 58c5ec8c3742..f9a0f2b540f1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1539,12 +1539,14 @@ struct task_struct {
 	cputime_t gtime;
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	raw_spinlock_t vtime_lock;
-	seqcount_t vtime_seq;
+	seqcount_t vtime_seqcount;
 	unsigned long long vtime_snap;
 	enum {
-		VTIME_SLEEPING = 0,
+		/* Task is sleeping or running in a CPU with VTIME inactive */
+		VTIME_INACTIVE = 0,
+		/* Task runs in userspace in a CPU with VTIME active */
 		VTIME_USER,
+		/* Task runs in kernelspace in a CPU with VTIME active */
 		VTIME_SYS,
 	} vtime_snap_whence;
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 46c1e8342ad8..4e93b4ea33f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1379,10 +1379,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-	raw_spin_lock_init(&p->vtime_lock);
-	seqcount_init(&p->vtime_seq);
+	seqcount_init(&p->vtime_seqcount);
 	p->vtime_snap = 0;
-	p->vtime_snap_whence = VTIME_SLEEPING;
+	p->vtime_snap_whence = VTIME_INACTIVE;
 #endif
 
 #if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/panic.c b/kernel/panic.c
index 50d4ae2e7d1b..3535f802953a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -61,6 +61,37 @@ void __weak panic_smp_self_stop(void)
 		cpu_relax();
 }
 
+/*
+ * Stop ourselves in NMI context if another CPU has already panicked. Arch code
+ * may override this to prepare for crash dumping, e.g. save regs info.
+ */
+void __weak nmi_panic_self_stop(struct pt_regs *regs)
+{
+	panic_smp_self_stop();
+}
+
+atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID);
+
+/*
+ * A variant of panic() called from NMI context. We return if we've already
+ * panicked on this CPU. If another CPU already panicked, loop in
+ * nmi_panic_self_stop() which can provide architecture dependent code such
+ * as saving register state for crash dump.
+ */
+void nmi_panic(struct pt_regs *regs, const char *msg)
+{
+	int old_cpu, cpu;
+
+	cpu = raw_smp_processor_id();
+	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu);
+
+	if (old_cpu == PANIC_CPU_INVALID)
+		panic("%s", msg);
+	else if (old_cpu != cpu)
+		nmi_panic_self_stop(regs);
+}
+EXPORT_SYMBOL(nmi_panic);
+
 /**
  *	panic - halt the system
  *	@fmt: The text string to print
@@ -71,17 +102,17 @@ void __weak panic_smp_self_stop(void)
  */
 void panic(const char *fmt, ...)
 {
-	static DEFINE_SPINLOCK(panic_lock);
 	static char buf[1024];
 	va_list args;
 	long i, i_next = 0;
 	int state = 0;
+	int old_cpu, this_cpu;
 
 	/*
 	 * Disable local interrupts. This will prevent panic_smp_self_stop
 	 * from deadlocking the first cpu that invokes the panic, since
 	 * there is nothing to prevent an interrupt handler (that runs
-	 * after the panic_lock is acquired) from invoking panic again.
+	 * after setting panic_cpu) from invoking panic() again.
 	 */
 	local_irq_disable();
 
@@ -94,8 +125,16 @@ void panic(const char *fmt, ...)
 	 * multiple parallel invocations of panic, all other CPUs either
 	 * stop themself or will wait until they are stopped by the 1st CPU
 	 * with smp_send_stop().
+	 *
+	 * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which
+	 * comes here, so go ahead.
+	 * `old_cpu == this_cpu' means we came from nmi_panic() which sets
+	 * panic_cpu to this CPU. In this case, this is also the 1st CPU.
 	 */
-	if (!spin_trylock(&panic_lock))
+	this_cpu = raw_smp_processor_id();
+	old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu);
+
+	if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu)
 		panic_smp_self_stop();
 
 	console_verbose();
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index c45f4b026230..4611b1c1cb12 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -680,7 +680,7 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 {
 	unsigned long long delta = vtime_delta(tsk);
 
-	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
 	tsk->vtime_snap += delta;
 
 	/* CHECKME: always safe to convert nsecs to cputime? */
@@ -696,45 +696,37 @@ static void __vtime_account_system(struct task_struct *tsk)
 
 void vtime_account_system(struct task_struct *tsk)
 {
-	raw_spin_lock(&tsk->vtime_lock);
-	write_seqcount_begin(&tsk->vtime_seq);
+	write_seqcount_begin(&tsk->vtime_seqcount);
 	__vtime_account_system(tsk);
-	write_seqcount_end(&tsk->vtime_seq);
-	raw_spin_unlock(&tsk->vtime_lock);
+	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
-	raw_spin_lock(&tsk->vtime_lock);
-	write_seqcount_begin(&tsk->vtime_seq);
+	write_seqcount_begin(&tsk->vtime_seqcount);
 	__vtime_account_system(tsk);
 	if (context_tracking_in_user())
 		tsk->vtime_snap_whence = VTIME_USER;
-	write_seqcount_end(&tsk->vtime_seq);
-	raw_spin_unlock(&tsk->vtime_lock);
+	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_account_user(struct task_struct *tsk)
 {
 	cputime_t delta_cpu;
 
-	raw_spin_lock(&tsk->vtime_lock);
-	write_seqcount_begin(&tsk->vtime_seq);
+	write_seqcount_begin(&tsk->vtime_seqcount);
 	delta_cpu = get_vtime_delta(tsk);
 	tsk->vtime_snap_whence = VTIME_SYS;
 	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
-	write_seqcount_end(&tsk->vtime_seq);
-	raw_spin_unlock(&tsk->vtime_lock);
+	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-	raw_spin_lock(&tsk->vtime_lock);
-	write_seqcount_begin(&tsk->vtime_seq);
+	write_seqcount_begin(&tsk->vtime_seqcount);
 	__vtime_account_system(tsk);
 	tsk->vtime_snap_whence = VTIME_USER;
-	write_seqcount_end(&tsk->vtime_seq);
-	raw_spin_unlock(&tsk->vtime_lock);
+	write_seqcount_end(&tsk->vtime_seqcount);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
@@ -746,23 +738,19 @@ void vtime_guest_enter(struct task_struct *tsk)
 	 * synchronization against the reader (task_gtime())
 	 * that can thus safely catch up with a tickless delta.
 	 */
-	raw_spin_lock(&tsk->vtime_lock);
-	write_seqcount_begin(&tsk->vtime_seq);
+	write_seqcount_begin(&tsk->vtime_seqcount);
 	__vtime_account_system(tsk);
 	current->flags |= PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seq);
-	raw_spin_unlock(&tsk->vtime_lock);
+	write_seqcount_end(&tsk->vtime_seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
-	raw_spin_lock(&tsk->vtime_lock);
-	write_seqcount_begin(&tsk->vtime_seq);
+	write_seqcount_begin(&tsk->vtime_seqcount);
 	__vtime_account_system(tsk);
 	current->flags &= ~PF_VCPU;
-	write_seqcount_end(&tsk->vtime_seq);
-	raw_spin_unlock(&tsk->vtime_lock);
+	write_seqcount_end(&tsk->vtime_seqcount);
 }
 EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
@@ -775,30 +763,26 @@ void vtime_account_idle(struct task_struct *tsk)
 
 void arch_vtime_task_switch(struct task_struct *prev)
 {
-	raw_spin_lock(&prev->vtime_lock);
-	write_seqcount_begin(&prev->vtime_seq);
-	prev->vtime_snap_whence = VTIME_SLEEPING;
-	write_seqcount_end(&prev->vtime_seq);
-	raw_spin_unlock(&prev->vtime_lock);
+	write_seqcount_begin(&prev->vtime_seqcount);
+	prev->vtime_snap_whence = VTIME_INACTIVE;
+	write_seqcount_end(&prev->vtime_seqcount);
 
-	raw_spin_lock(&current->vtime_lock);
-	write_seqcount_begin(&current->vtime_seq);
+	write_seqcount_begin(&current->vtime_seqcount);
 	current->vtime_snap_whence = VTIME_SYS;
 	current->vtime_snap = sched_clock_cpu(smp_processor_id());
-	write_seqcount_end(&current->vtime_seq);
-	raw_spin_unlock(&current->vtime_lock);
+	write_seqcount_end(&current->vtime_seqcount);
 }
 
 void vtime_init_idle(struct task_struct *t, int cpu)
 {
 	unsigned long flags;
 
-	raw_spin_lock_irqsave(&t->vtime_lock, flags);
-	write_seqcount_begin(&t->vtime_seq);
+	local_irq_save(flags);
+	write_seqcount_begin(&t->vtime_seqcount);
 	t->vtime_snap_whence = VTIME_SYS;
 	t->vtime_snap = sched_clock_cpu(cpu);
-	write_seqcount_end(&t->vtime_seq);
-	raw_spin_unlock_irqrestore(&t->vtime_lock, flags);
+	write_seqcount_end(&t->vtime_seqcount);
+	local_irq_restore(flags);
 }
 
 cputime_t task_gtime(struct task_struct *t)
@@ -810,13 +794,13 @@ cputime_t task_gtime(struct task_struct *t)
 		return t->gtime;
 
 	do {
-		seq = read_seqcount_begin(&t->vtime_seq);
+		seq = read_seqcount_begin(&t->vtime_seqcount);
 
 		gtime = t->gtime;
 		if (t->flags & PF_VCPU)
 			gtime += vtime_delta(t);
 
-	} while (read_seqcount_retry(&t->vtime_seq, seq));
+	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
 
 	return gtime;
 }
@@ -839,7 +823,7 @@ fetch_task_cputime(struct task_struct *t,
 		*udelta = 0;
 		*sdelta = 0;
 
-		seq = read_seqcount_begin(&t->vtime_seq);
+		seq = read_seqcount_begin(&t->vtime_seqcount);
 
 		if (u_dst)
 			*u_dst = *u_src;
@@ -847,7 +831,7 @@ fetch_task_cputime(struct task_struct *t,
 			*s_dst = *s_src;
 
 		/* Task is sleeping, nothing to add */
-		if (t->vtime_snap_whence == VTIME_SLEEPING ||
+		if (t->vtime_snap_whence == VTIME_INACTIVE ||
 		    is_idle_task(t))
 			continue;
@@ -863,7 +847,7 @@ fetch_task_cputime(struct task_struct *t,
 			if (t->vtime_snap_whence == VTIME_SYS)
 				*sdelta = delta;
 		}
-	} while (read_seqcount_retry(&t->vtime_seq, seq));
+	} while (read_seqcount_retry(&t->vtime_seqcount, seq));
 }
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index d974121159ca..47d143740774 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -361,7 +361,7 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		raw_spin_unlock(&watchdog_output_lock);
 
 		if (hardlockup_panic)
-			panic("Hard LOCKUP");
+			nmi_panic(regs, "Hard LOCKUP");
 
 		__this_cpu_write(hard_watchdog_warn, true);
 		return;
diff --git a/localversion-rt b/localversion-rt
index 18777ec0c27d..1199ebade17b 100644
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt15
+-rt16
--
To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html