From: Daniel Wagner <daniel.wagner@xxxxxxxxxxxx> pkg_work_lock is used to serialize access to pkg_work_scheduled. pkg_temp_thermal_device_add() reallocates pkg_work_scheduled when CPUs are added (when they are removed, pkg_work_scheduled won't be updated). Since pkg_work_scheduled is accessed from interrupt context, spin_lock_irqsave() is the right thing to use on mainline. On RT the spin lock is a mutex, and therefore we must not sleep in irq context: [<ffffffff816850ac>] dump_stack+0x4e/0x8f [<ffffffff81680f7d>] __schedule_bug+0xa6/0xb4 [<ffffffff816896b4>] __schedule+0x5b4/0x700 [<ffffffff8168982a>] schedule+0x2a/0x90 [<ffffffff8168a8b5>] rt_spin_lock_slowlock+0xe5/0x2d0 [<ffffffff8168afd5>] rt_spin_lock+0x25/0x30 [<ffffffffa03a7b75>] pkg_temp_thermal_platform_thermal_notify+0x45/0x134 [x86_pkg_temp_thermal] [<ffffffff8103d4db>] ? therm_throt_process+0x1b/0x160 [<ffffffff8103d831>] intel_thermal_interrupt+0x211/0x250 [<ffffffff8103d8c1>] smp_thermal_interrupt+0x21/0x40 [<ffffffff8169415d>] thermal_interrupt+0x6d/0x80 Again, this is a bug on mainline. I didn't find a good way to get rid of the krealloc in pkg_temp_thermal_device_add(). Instead of using krealloc I switched to an open-coded version so that the allocation happens outside of raw_spin_lock_irqsave(). Signed-off-by: Daniel Wagner <daniel.wagner@xxxxxxxxxxxx> --- Hi, I tested the IRQ handler part but I have not yet been able to test the CPU part. I guess this could be tested via qemu; I just need to figure out how it works. Anyway, I tried to get rid of pkg_work_scheduled but I have not found an obvious way to do it. I am open to any suggestions. 
cheers, daniel drivers/thermal/x86_pkg_temp_thermal.c | 49 ++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c index 7257366..7d0b0ac 100644 --- a/drivers/thermal/x86_pkg_temp_thermal.c +++ b/drivers/thermal/x86_pkg_temp_thermal.c @@ -83,7 +83,7 @@ static DEFINE_PER_CPU(struct delayed_work, pkg_temp_thermal_threshold_work); static u8 *pkg_work_scheduled; /* Spin lock to prevent races with pkg_work_scheduled */ -static spinlock_t pkg_work_lock; +static DEFINE_RAW_SPINLOCK(pkg_work_lock); static u16 max_phy_id; /* Debug counters to show using debugfs */ @@ -325,14 +325,14 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work) if (!phdev) return; - spin_lock_irqsave(&pkg_work_lock, flags); + raw_spin_lock_irqsave(&pkg_work_lock, flags); ++pkg_work_cnt; if (unlikely(phy_id > max_phy_id)) { - spin_unlock_irqrestore(&pkg_work_lock, flags); + raw_spin_unlock_irqrestore(&pkg_work_lock, flags); return; } pkg_work_scheduled[phy_id] = 0; - spin_unlock_irqrestore(&pkg_work_lock, flags); + raw_spin_unlock_irqrestore(&pkg_work_lock, flags); enable_pkg_thres_interrupt(); rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); @@ -363,16 +363,16 @@ static int pkg_temp_thermal_platform_thermal_notify(__u64 msr_val) * are in the same interrupt state. So scheduling on any one CPU in * the package is enough and simply return for others. 
*/ - spin_lock_irqsave(&pkg_work_lock, flags); + raw_spin_lock_irqsave(&pkg_work_lock, flags); ++pkg_interrupt_cnt; if (unlikely(phy_id > max_phy_id) || unlikely(!pkg_work_scheduled) || pkg_work_scheduled[phy_id]) { disable_pkg_thres_interrupt(); - spin_unlock_irqrestore(&pkg_work_lock, flags); + raw_spin_unlock_irqrestore(&pkg_work_lock, flags); return -EINVAL; } pkg_work_scheduled[phy_id] = 1; - spin_unlock_irqrestore(&pkg_work_lock, flags); + raw_spin_unlock_irqrestore(&pkg_work_lock, flags); disable_pkg_thres_interrupt(); schedule_delayed_work_on(cpu, @@ -401,7 +401,7 @@ static int pkg_temp_thermal_device_add(unsigned int cpu) char buffer[30]; int thres_count; u32 eax, ebx, ecx, edx; - u8 *temp; + u8 *temp, *p; unsigned long flags; cpuid(6, &eax, &ebx, &ecx, &edx); @@ -426,19 +426,28 @@ static int pkg_temp_thermal_device_add(unsigned int cpu) goto err_ret_unlock; } - spin_lock_irqsave(&pkg_work_lock, flags); - if (topology_physical_package_id(cpu) > max_phy_id) + if (topology_physical_package_id(cpu) > max_phy_id) { max_phy_id = topology_physical_package_id(cpu); - temp = krealloc(pkg_work_scheduled, - (max_phy_id+1) * sizeof(u8), GFP_ATOMIC); - if (!temp) { - spin_unlock_irqrestore(&pkg_work_lock, flags); - err = -ENOMEM; - goto err_ret_free; + + temp = kmalloc((max_phy_id+1) * sizeof(u8), GFP_KERNEL); + if (!temp) { + err = -ENOMEM; + goto err_ret_free; + } + + raw_spin_lock_irqsave(&pkg_work_lock, flags); + + p = pkg_work_scheduled; + + memcpy(temp, pkg_work_scheduled, ksize(pkg_work_scheduled)); + pkg_work_scheduled = temp; + + pkg_work_scheduled[topology_physical_package_id(cpu)] = 0; + + raw_spin_unlock_irqrestore(&pkg_work_lock, flags); + + kfree(p); } - pkg_work_scheduled = temp; - pkg_work_scheduled[topology_physical_package_id(cpu)] = 0; - spin_unlock_irqrestore(&pkg_work_lock, flags); phy_dev_entry->phys_proc_id = topology_physical_package_id(cpu); phy_dev_entry->first_cpu = cpu; @@ -587,7 +596,7 @@ static int __init pkg_temp_thermal_init(void) if 
(!x86_match_cpu(pkg_temp_thermal_ids)) return -ENODEV; - spin_lock_init(&pkg_work_lock); + raw_spin_lock_init(&pkg_work_lock); platform_thermal_package_notify = pkg_temp_thermal_platform_thermal_notify; platform_thermal_package_rate_control = -- 1.9.0 -- To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html