The patch titled lapic horror has been removed from the -mm tree. Its filename was lapic-horror.patch This patch was dropped because it had testing failures ------------------------------------------------------ Subject: lapic horror From: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > > That looks reasonable. It really boils down to the lapic not working > > when going idle. Can you please give the attached patch on top of -rc2-mm2 a test ride on your jinxed VAIO ? UP and SMP with and without HIGHRES and/or NO_HZ should be fine, when the BIOS cheating on C-States detection works as expected. Please boot with "apic=verbose". Also add "lapic" on an UP kernel (I think we should make this the default anyway). The "highres=off" option is also fixed with the patch, so you can disable highres at boottime. I'll send a nicely split up, sanitized and commented one, when I'm sure that I'm no longer a participant of the plot to make your VAIO defunctional :) Cc: Ingo Molnar <mingo@xxxxxxx> Cc: Arjan van de Ven <arjan@xxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxx> --- arch/i386/Kconfig | 4 arch/i386/kernel/acpi/boot.c | 5 arch/i386/kernel/apic.c | 2176 ++++++++++++++++---------------- arch/i386/kernel/i8253.c | 5 arch/i386/kernel/io_apic.c | 2 arch/i386/kernel/irq.c | 22 arch/i386/kernel/smpboot.c | 9 arch/i386/kernel/time.c | 5 arch/i386/kernel/time_hpet.c | 22 drivers/acpi/processor_idle.c | 192 ++ drivers/clocksource/acpi_pm.c | 17 include/acpi/processor.h | 4 include/asm-i386/apic.h | 10 include/asm-i386/mpspec.h | 1 include/asm-x86_64/apic.h | 2 include/linux/acpi_pmtmr.h | 37 include/linux/clockchips.h | 17 kernel/hrtimer.c | 10 kernel/time/Kconfig | 4 kernel/time/clockevents.c | 207 ++- 20 files changed, 1629 insertions(+), 1122 deletions(-) diff -puN arch/i386/Kconfig~lapic-horror arch/i386/Kconfig --- a/arch/i386/Kconfig~lapic-horror +++ a/arch/i386/Kconfig @@ -22,6 +22,10 @@ config GENERIC_CLOCKEVENTS bool default y +config GENERIC_CLOCKEVENTS_BROADCAST + bool + 
default y + config LOCKDEP_SUPPORT bool default y diff -puN arch/i386/kernel/acpi/boot.c~lapic-horror arch/i386/kernel/acpi/boot.c --- a/arch/i386/kernel/acpi/boot.c~lapic-horror +++ a/arch/i386/kernel/acpi/boot.c @@ -25,6 +25,7 @@ #include <linux/init.h> #include <linux/acpi.h> +#include <linux/acpi_pmtmr.h> #include <linux/efi.h> #include <linux/cpumask.h> #include <linux/module.h> @@ -702,10 +703,6 @@ static int __init acpi_parse_hpet(unsign #define acpi_parse_hpet NULL #endif -#ifdef CONFIG_X86_PM_TIMER -extern u32 pmtmr_ioport; -#endif - static int __init acpi_parse_fadt(unsigned long phys, unsigned long size) { struct fadt_descriptor *fadt = NULL; diff -puN arch/i386/kernel/apic.c~lapic-horror arch/i386/kernel/apic.c --- a/arch/i386/kernel/apic.c~lapic-horror +++ a/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include <linux/sysdev.h> #include <linux/cpu.h> #include <linux/clockchips.h> +#include <linux/acpi_pmtmr.h> #include <linux/module.h> #include <asm/atomic.h> @@ -45,29 +46,21 @@ #include "io_ports.h" /* - * cpu_mask that denotes the CPUs that needs timer interrupt coming in as - * IPIs in place of local APIC timers + * Sanity check */ -static cpumask_t timer_bcast_ipi; +#if (SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F +# error SPURIOUS_APIC_VECTOR definition error +#endif /* * Knob to control our willingness to enable the local APIC. 
+ * + * -1=force-disable, +1=force-enable */ -static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ - -static inline void lapic_disable(void) -{ - enable_local_apic = -1; - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); -} - -static inline void lapic_enable(void) -{ - enable_local_apic = 1; -} +static int enable_local_apic __initdata = 0; /* - * Debug level + * Debug level, exported for io_apic.c */ int apic_verbosity; @@ -77,6 +70,8 @@ static void lapic_next_event(unsigned lo struct clock_event_device *evt); static void lapic_timer_setup(enum clock_event_mode mode, struct clock_event_device *evt); +static void lapic_timer_broadcast(cpumask_t *mask); +static void apic_pm_activate(void); /* * The local apic timer can be used for any function which is CPU local. @@ -97,543 +92,709 @@ static struct clock_event_device lapic_c .set_mode = lapic_timer_setup, .set_next_event = lapic_next_event, }; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -static void apic_pm_activate(void); +/* + * Per CPU local APIC data structure: + * - clock event device + * - variables to hold timer verification data + */ +struct lapic_event_device { + struct clock_event_device evdev; + unsigned long last_delta; + unsigned long counter; +}; +static DEFINE_PER_CPU(struct lapic_event_device, lapic_events); -static int modern_apic(void) +/* Scaled math multiplication factor for ACPI lapic verification */ +static unsigned long acpi_verify_mult; + +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) { - unsigned int lvr, version; - /* AMD systems use old APIC versions, so check the CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) - return 1; - lvr = apic_read(APIC_LVR); - version = GET_APIC_VERSION(lvr); - return version >= 0x14; + return 
GET_APIC_VERSION(apic_read(APIC_LVR)); } /* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. + * Check, if the APIC is integrated or a seperate chip */ -void ack_bad_irq(unsigned int irq) +static inline int lapic_is_integrated(void) { - printk("unexpected IRQ trap at vector %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But only ack when the APIC is enabled -AK - */ - if (cpu_has_apic) - ack_APIC_irq(); + return APIC_INTEGRATED(lapic_get_version()); } -void __init apic_intr_init(void) +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) { -#ifdef CONFIG_SMP - smp_intr_init(); -#endif - /* self generated IPI for local APIC timer */ - set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* thermal monitor LVT interrupt */ -#ifdef CONFIG_X86_MCE_P4THERMAL - set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + return lapic_get_version() >= 0x14; } -/* Using APIC to generate smp_local_timer_interrupt? 
*/ -int using_apic_timer __read_mostly = 0; - -static int enabled_via_apicbase; - +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ void enable_NMI_through_LVT0 (void * dummy) { - unsigned int v, ver; + unsigned int v = APIC_DM_NMI; - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - v = APIC_DM_NMI; /* unmask and set to NMI */ - if (!APIC_INTEGRATED(ver)) /* 82489DX */ + /* Level triggered for 82489DX */ + if (!lapic_is_integrated()) v |= APIC_LVT_LEVEL_TRIGGER; apic_write_around(APIC_LVT0, v); } +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ int get_physical_broadcast(void) { - if (modern_apic()) - return 0xff; - else - return 0xf; + return modern_apic() ? 0xff : 0xf; } -int get_maxlvt(void) +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) { - unsigned int v, ver, maxlvt; + unsigned int v = apic_read(APIC_LVR); - v = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(v); /* 82489DXs do not report # of LVT entries. */ - maxlvt = APIC_INTEGRATED(ver) ? GET_APIC_MAXLVT(v) : 2; - return maxlvt; + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } -void clear_local_APIC(void) -{ - int maxlvt; - unsigned long v; +/* + * Local APIC timer + */ - maxlvt = get_maxlvt(); +/* Clock divisor is set to 16 */ +#define APIC_DIVISOR 16 - /* - * Masking an LVT entry can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. 
- */ - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); - } +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + apic_write_around(APIC_LVTT, lvtt_value); -/* lets not touch this if we didn't frob it */ -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) { - v = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); - } -#endif /* - * Clean APIC state for other OSs: + * Divide PICLK by 16 */ - apic_write_around(APIC_LVTT, APIC_LVT_MASKED); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - apic_write_around(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + tmp_value = apic_read(APIC_TDCR); + apic_write_around(APIC_TDCR, (tmp_value + & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) + | APIC_TDR_DIV_16); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); -#endif - 
v = GET_APIC_VERSION(apic_read(APIC_LVR)); - if (APIC_INTEGRATED(v)) { /* !82489DX */ - if (maxlvt > 3) /* Due to Pentium errata 3AP and 11AP. */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } + if (!oneshot) + apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -void __init connect_bsp_APIC(void) +/* + * Program the next event, relative to now + */ +static void lapic_next_event(unsigned long delta, + struct clock_event_device *evt) { - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. - * connect BSP's local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } - enable_apic_mode(); + struct lapic_event_device *ldev; + + ldev = container_of(evt, struct lapic_event_device, evdev); + ldev->last_delta = delta; + + apic_write_around(APIC_TMICT, delta); } -void disconnect_bsp_APIC(int virt_wire_setup) +/* + * Setup the lapic timer in periodic or oneshot mode + */ +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) { - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect - * only on certain older boards). Note that APIC - * interrupts, including IPIs, won't work beyond - * this point! The only exception are INIT IPIs. 
- */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); + struct lapic_event_device *ldev; + unsigned long flags; + unsigned int v; + + ldev = container_of(evt, struct lapic_event_device, evdev); + + local_irq_save(flags); + switch (mode) { + case CLOCK_EVT_PERIODIC: + ldev->last_delta = calibration_result / APIC_DIVISOR; + case CLOCK_EVT_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_PERIODIC, 1); + break; + case CLOCK_EVT_SHUTDOWN: + ldev->last_delta = 0; + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, v); + break; } - else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; + local_irq_restore(flags); +} - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write_around(APIC_SPIV, value); +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. 
+ */ +static void __devinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events).evdev; - if (!virt_wire_setup) { - /* For LVT0 make it edge triggered, active high, external and enabled */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); - } - else { - /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - } + memcpy(levt, &lapic_clockevent, sizeof(*levt)); - /* For LVT1 make it edge triggered, active high, nmi and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); - } + register_local_clockevent(levt); } -void disable_local_APIC(void) +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. 
+ * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) + +static __initdata volatile int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct pt_regs *regs) { - unsigned long value; + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); - clear_local_APIC(); + if (cpu_has_tsc) + rdtscll(tsc); - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write_around(APIC_SPIV, value); + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; - if (enabled_via_apicbase) { - unsigned int l, h; - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_ENABLE; - wrmsr(MSR_IA32_APICBASE, l, h); + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; } } /* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. + * Setup the boot APIC + * + * Calibrate and verify the result. */ -int __init verify_local_APIC(void) +void __init setup_boot_APIC_clock(void) { - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. 
- */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + struct clock_event_device *levt = &__get_cpu_var(lapic_events).evdev; + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct pt_regs *regs); + unsigned long deltaj; + long delta, deltapm; + cpumask_t cpumask; + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + /* Register broadcast function */ + clockevents_register_broadcast(lapic_timer_broadcast); + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + __setup_APIC_LVTT(1, 0, 0); + /* Let the interrupts run */ + local_irq_enable(); + while(lapic_cal_loops < 2); + local_irq_disable(); + lapic_cal_loops = -1; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... 
PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta ) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + /* + * Calculate the pmtimer -> lapic conversion factor to + * verify the lapic stability in the power states. + */ + acpi_verify_mult = div_sc(delta, deltapm, 22); + apic_printk(APIC_VERBOSE, "... acpi_verify_mult = %lu\n", + acpi_verify_mult); + } - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 32); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... 
host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. + * Start LAPIC timer and verify that the calculated factor is correct */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + setup_APIC_timer(); - return 1; + /* Replace the lapic interrupt handler */ + real_handler = levt->event_handler; + levt->event_handler = lapic_cal_handler; + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while(lapic_cal_loops <= LAPIC_CAL_LOOPS); + + local_irq_disable(); + + /* Restore the real event handler */ + levt->event_handler = real_handler; + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + /* Check, if the jiffies result is consistent */ + if (deltaj < LAPIC_CAL_LOOPS-2 || + deltaj > LAPIC_CAL_LOOPS+2) { + /* + * Not sure, what we can do about this one. + * When high resultion timers are active + * and the lapic timer does not stop in C3 + * we are fine. Otherwise more trouble might + * be waiting. 
-- tglx + */ + printk(KERN_WARNING "Global event device %s " + "has wrong frequency " + "(%lu ticks instead of %d)\n", + global_clock_event->name, deltaj, + LAPIC_CAL_LOOPS); + } + return; + } + } else { + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && + deltaj <= LAPIC_CAL_LOOPS+2) { + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + return; + } + } + + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + local_irq_disable(); + cpumask = cpumask_of_cpu(smp_processor_id()); + switch_APIC_timer_to_ipi(&cpumask); + local_irq_enable(); } -void __init sync_Arb_IDs(void) +void __devinit setup_secondary_APIC_clock(void) { - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 - And not needed on AMD */ - if (modern_apic()) - return; - /* - * Wait for idle. - */ - apic_wait_icr_idle(); + setup_APIC_timer(); +} - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); +void switch_APIC_timer_to_ipi(void *cpumask) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events).evdev; + cpumask_t mask = *(cpumask_t *)cpumask; + int cpu = smp_processor_id(); + + if (cpu_isset(cpu, mask)) + clockevents_set_global_broadcast(levt, 1); } +EXPORT_SYMBOL(switch_APIC_timer_to_ipi); + +void switch_ipi_to_APIC_timer(void *cpumask) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events).evdev; + cpumask_t mask = *(cpumask_t *)cpumask; + int cpu = smp_processor_id(); -extern void __error_in_apic_c (void); + if (cpu_isset(cpu, mask)) + clockevents_set_global_broadcast(levt, 0); +} +EXPORT_SYMBOL(switch_ipi_to_APIC_timer); /* - * An initial setup of the virtual wire mode. 
+ * The guts of the apic timer interrupt */ -void __init init_bsp_APIC(void) +fastcall void local_apic_timer_interrupt(struct pt_regs *regs) { - unsigned long value, ver; + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu).evdev; - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; + per_cpu(irq_stat, cpu).apic_timer_irqs++; - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); + evt->event_handler(regs); +} - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ + +fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); /* - * Enable APIC. + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - - /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) - value &= ~APIC_SPIV_FOCUS_DISABLED; - else - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); - + ack_APIC_irq(); /* - * Set up the virtual wire mode. + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. 
*/ - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + irq_enter(); + local_apic_timer_interrupt(regs); + irq_exit(); + set_irq_regs(old_regs); } -void __devinit setup_local_APIC(void) +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(cpumask_t *cpumask) { - unsigned long oldvalue, value, ver, maxlvt; - int i, j; + int cpu = smp_processor_id(); + cpumask_t mask; - /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); + cpus_and(mask, cpu_online_map, *cpumask); + if (cpu_isset(cpu, mask)) { + cpu_clear(cpu, mask); + local_apic_timer_interrupt(get_irq_regs()); } +#ifdef CONFIG_SMP + if (!cpus_empty(mask)) + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#endif +} - value = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(value); +/* + * Local APIC set next event broadcast + */ +void lapic_timer_idle_broadcast(int broadcast) +{ + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu).evdev; + unsigned long flags; - if ((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f) - __error_in_apic_c(); + local_irq_save(flags); + clockevents_set_broadcast(evt, broadcast); + local_irq_restore(flags); +} - /* - * Double-check whether this APIC is really registered. - */ - if (!apic_id_registered()) - BUG(); +/* + * Local APIC verify that timer is stable during this power state + * + * Called with interrupts disabled. 
+ */ +int lapic_timer_idle_verify(unsigned long ticks) +{ + struct lapic_event_device *dev = &__get_cpu_var(lapic_events); + long delta_apic, delta_pm, delta, counter = apic_read(APIC_TMCCT); + const uint32_t pm_500us = PMTMR_TICKS_PER_SEC/2000; + const long pm_250us = PMTMR_TICKS_PER_SEC/4000; + const long pm_100us = PMTMR_TICKS_PER_SEC/10000; + uint64_t delta_ticks; /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... + * Start the verification: Store current time and the apic counter */ - init_apic_ldr(); + if (!ticks) { + dev->counter = counter; + return 0; + } /* - * Set Task Priority to 'accept all'. We never change this - * later on. + * End of verification: + * + * Convert pm timer ticks (from ACPI) to lapic ticks and + * compare with the lapic delta. + * + * We do not make decisions on short sleeps (< 500us) */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write_around(APIC_TASKPRI, value); + if (ticks < pm_500us) + return 0; + delta_ticks = (((u64) ticks) * acpi_verify_mult) >> 22; + delta_pm = (long) delta_ticks; + delta_apic = dev->counter - counter; + /* Take wraps in periodic mode into account */ + if (delta_apic < 0) + delta_apic += dev->last_delta; + + /* Calculate the delta between lapic and pm timer */ + delta = delta_pm - delta_apic; + /* + * The delta between pmtimer and lapic is less than 100us: + * lapic is stable. This catches also delta_pm < delta_apic, + * which happens due to clock skew and rounding errors. + */ + if (delta < pm_100us) { + apic_printk(APIC_VERBOSE, "lapic timer verify: delta %ld " + "pmtimer %ld (%ld) lapic %ld cpu %d\n", delta, + delta_pm, ticks, delta_apic, smp_processor_id()); + return 1; + } /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. 
- * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + * The delta between pmtimer and lapic is greater than 250us: + * lapic is unstable. */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<<j)) - ack_APIC_irq(); - } + if (delta > pm_250us) { + apic_printk(APIC_VERBOSE, "lapic timer verify: delta %ld " + "pmtimer %ld (%ld) lapic %ld cpu %d\n", delta, + delta_pm, ticks, delta_apic, smp_processor_id()); + return -1; } + return 0; +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ +void clear_local_APIC(void) +{ + int maxlvt = lapic_get_maxlvt(); + unsigned long v; /* - * Now that we are all set up, enable the APIC + * Masking an LVT entry can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + } /* - * Enable APIC + * Careful: we have to set masks only first to deassert + * any level-triggered sources. 
*/ - value |= APIC_SPIV_APIC_ENABLED; + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + } + /* lets not touch this if we didn't frob it */ +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif /* - * Some unknown Intel IO/APIC (or APIC) errata is biting us with - * certain networking cards. If high frequency interrupts are - * happening on a particular IOAPIC pin, plus the IOAPIC routing - * entry is masked/unmasked at a high rate as well then sooner or - * later IOAPIC line gets 'stuck', no more interrupts are received - * from the device. If focus CPU is disabled then the hang goes - * away, oh well :-( - * - * [ This bug can be reproduced easily with a level-triggered - * PCI Ne2000 networking cards and PII/PIII processors, dual - * BX chipset. ] - */ - /* - * Actually disabling the focus CPU check just makes the hang less - * frequent as it makes the interrupt distributon model be more - * like LRU than MRU (the short-term load is more even across CPUs). - * See also the comment in end_level_ioapic_irq(). 
--macro + * Clean APIC state for other OSs: */ -#if 1 - /* Enable focus processor (bit==0) */ - value &= ~APIC_SPIV_FOCUS_DISABLED; -#else - /* Disable focus processor (bit==1) */ - value |= APIC_SPIV_FOCUS_DISABLED; + apic_write_around(APIC_LVTT, APIC_LVT_MASKED); + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + +#ifdef CONFIG_X86_MCE_P4THERMAL + if (maxlvt >= 5) + apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); #endif - /* - * Set spurious IRQ vector - */ - value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + unsigned long value; + + clear_local_APIC(); /* - * Set up LVT0, LVT1: - * - * set up through-local-APIC on the BP's LINT0. This is not - * strictly necessery in pure symmetric-IO mode, but sometimes - * we delegate interrupts to the 8259A. - */ - /* - * TODO: set up through-local-APIC from through-I/O-APIC? --macro + * Disable APIC (implies clearing of registers + * for 82489DX!). */ - value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && (pic_mode || !value)) { - value = APIC_DM_EXTINT; - apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", - smp_processor_id()); - } else { - value = APIC_DM_EXTINT | APIC_LVT_MASKED; - apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", - smp_processor_id()); - } - apic_write_around(APIC_LVT0, value); + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write_around(APIC_SPIV, value); /* - * only the BP should see the LINT1 NMI signal, obviously. 
+ * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. */ - if (!smp_processor_id()) - value = APIC_DM_NMI; - else - value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!APIC_INTEGRATED(ver)) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); - - if (APIC_INTEGRATED(ver) && !esr_disable) { /* !82489DX */ - maxlvt = get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); + if (enabled_via_apicbase) { + unsigned int l, h; - value = ERROR_APIC_VECTOR; // enables sending errors - apic_write_around(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - if (esr_disable) - /* - * Something untraceble is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk("Leaving ESR disabled.\n"); - else - printk("No ESR for 82489DX.\n"); + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); } - - setup_apic_nmi_watchdog(NULL); - apic_pm_activate(); } /* - * If Linux enabled the LAPIC against the BIOS default - * disable it down before re-entering the BIOS on shutdown. - * Otherwise the BIOS may get confused and not power-off. - * Additionally clear all LVT entries before disable_local_APIC + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC * for the case where Linux didn't enable the LAPIC. 
*/ void lapic_shutdown(void) @@ -652,154 +813,301 @@ void lapic_shutdown(void) local_irq_restore(flags); } -#ifdef CONFIG_PM - -static struct { - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) { - unsigned long flags; - - if (!apic_pm_state.active) - return 0; + unsigned int reg0, reg1; - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} + /* + * The version register is read-only in a real APIC. 
+ */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; - if (!apic_pm_state.active) + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = lapic_get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) return 0; - local_irq_save(flags); + /* + * The ID register is read/write in a real APIC. + */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. 
*/ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - local_irq_restore(flags); - return 0; + return 1; } -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs */ - -static struct sysdev_class lapic_sysclass = { - set_kset_name("lapic"), - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __devinit apic_pm_activate(void) +void __init sync_Arb_IDs(void) { - apic_pm_state.active = 1; + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic()) + return; + /* + * Wait for idle. 
+ */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG + | APIC_DM_INIT); } -static int __init init_lapic_sysfs(void) +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) { - int error; + unsigned long value; - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); -#else /* CONFIG_PM */ + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; -static void apic_pm_activate(void) { } + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); -#endif /* CONFIG_PM */ + /* + * Set up the virtual wire mode. + */ + apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); +} -/* - * Detect and enable local APICs on non-SMP boards. - * Original code written by Keir Fraser. 
+/** + * setup_local_APIC - setup the local APIC */ - -static int __init apic_set_verbosity(char *str) +void __devinit setup_local_APIC(void) { - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - return 1; -} + unsigned long oldvalue, value, maxlvt, integrated; + int i, j; -__setup("apic=", apic_set_verbosity); + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } + + integrated = lapic_is_integrated(); + + /* + * Double-check whether this APIC is really registered. + */ + if (!apic_id_registered()) + BUG(); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + init_apic_ldr(); + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write_around(APIC_TASKPRI, value); + + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. 
+ */ + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1<<j)) + ack_APIC_irq(); + } + } + + /* + * Now that we are all set up, enable the APIC + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + /* + * Enable APIC + */ + value |= APIC_SPIV_APIC_ENABLED; + + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. ] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distributon model be more + * like LRU than MRU (the short-term load is more even across CPUs). + * See also the comment in end_level_ioapic_irq(). --macro + */ + + /* Enable focus processor (bit==0) */ + value &= ~APIC_SPIV_FOCUS_DISABLED; + + /* + * Set spurious IRQ vector + */ + value |= SPURIOUS_APIC_VECTOR; + apic_write_around(APIC_SPIV, value); + + /* + * Set up LVT0, LVT1: + * + * set up through-local-APIC on the BP's LINT0. This is not + * strictly necessery in pure symmetric-IO mode, but sometimes + * we delegate interrupts to the 8259A. + */ + /* + * TODO: set up through-local-APIC from through-I/O-APIC? 
--macro + */ + value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; + if (!smp_processor_id() && (pic_mode || !value)) { + value = APIC_DM_EXTINT; + apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", + smp_processor_id()); + } else { + value = APIC_DM_EXTINT | APIC_LVT_MASKED; + apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", + smp_processor_id()); + } + apic_write_around(APIC_LVT0, value); + /* + * only the BP should see the LINT1 NMI signal, obviously. + */ + if (!smp_processor_id()) + value = APIC_DM_NMI; + else + value = APIC_DM_NMI | APIC_LVT_MASKED; + if (!integrated) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write_around(APIC_LVT1, value); + + if (integrated && !esr_disable) { /* !82489DX */ + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write_around(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08lx after: 0x%08lx\n", + oldvalue, value); + } else { + if (esr_disable) + /* + * Something untraceble is creating bad interrupts on + * secondary quads ... 
for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk(KERN_INFO "Leaving ESR disabled.\n"); + else + printk(KERN_INFO "No ESR for 82489DX.\n"); + } + + /* Disabled the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write_around(APIC_LVTT, value); + + setup_apic_nmi_watchdog(NULL); + apic_pm_activate(); +} + +/* + * Detect and initialize APIC + */ static int __init detect_init_APIC (void) { u32 h, l, features; @@ -811,7 +1119,7 @@ static int __init detect_init_APIC (void switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_AMD: if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || - (boot_cpu_data.x86 == 15)) + (boot_cpu_data.x86 == 15)) break; goto no_apic; case X86_VENDOR_INTEL: @@ -825,23 +1133,23 @@ static int __init detect_init_APIC (void if (!cpu_has_apic) { /* - * Over-ride BIOS and try to enable the local - * APIC only if "lapic" specified. + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. */ if (enable_local_apic <= 0) { - printk("Local APIC disabled by BIOS -- " + printk(KERN_INFO "Local APIC disabled by BIOS -- " "you can enable it with \"lapic\"\n"); return -1; } /* - * Some BIOSes disable the local APIC in the - * APIC_BASE MSR. This can only be done in - * software for Intel P6 or later and AMD K7 - * (Model > 1) or later. + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. 
*/ rdmsr(MSR_IA32_APICBASE, l, h); if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk("Local APIC disabled by BIOS -- reenabling.\n"); + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; wrmsr(MSR_IA32_APICBASE, l, h); @@ -854,7 +1162,7 @@ static int __init detect_init_APIC (void */ features = cpuid_edx(1); if (!(features & (1 << X86_FEATURE_APIC))) { - printk("Could not enable APIC!\n"); + printk(KERN_WARNING "Could not enable APIC!\n"); return -1; } set_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); @@ -868,17 +1176,20 @@ static int __init detect_init_APIC (void if (nmi_watchdog != NMI_NONE) nmi_watchdog = NMI_LOCAL_APIC; - printk("Found and enabled local APIC!\n"); + printk(KERN_INFO "Found and enabled local APIC!\n"); apic_pm_activate(); return 0; no_apic: - printk("No local APIC present or hardware disabled\n"); + printk(KERN_INFO "No local APIC present or hardware disabled\n"); return -1; } +/** + * init_apic_mappings - initialize APIC mappings + */ void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -938,448 +1249,88 @@ fake_ioapic_page: } /* - * This part sets up the APIC 32 bit clock in LVTT1, with HZ interrupts - * per second. We assume that the caller has already set up the local - * APIC. - * - * The APIC timer is not exactly sync with the external timer chip, it - * closely follows bus clocks. - */ - -/* - * FIXME: Move this to i8253.h. There is no need to keep the access to - * the PIT scattered all around the place -tglx - */ - -/* - * The timer chip is already set up at HZ interrupts per second here, - * but we do not accept timer interrupts yet. We only allow the BP - * to calibrate. 
- */ -static unsigned int __devinit get_8254_timer_count(void) -{ - unsigned long flags; - - unsigned int count; - - spin_lock_irqsave(&i8253_lock, flags); - - outb_p(0x00, PIT_MODE); - count = inb_p(PIT_CH0); - count |= inb_p(PIT_CH0) << 8; - - spin_unlock_irqrestore(&i8253_lock, flags); - - return count; -} - -/* next tick in 8254 can be caught by catching timer wraparound */ -static void __devinit wait_8254_wraparound(void) -{ - unsigned int curr_count, prev_count; - - curr_count = get_8254_timer_count(); - do { - prev_count = curr_count; - curr_count = get_8254_timer_count(); - - /* workaround for broken Mercury/Neptune */ - if (prev_count >= curr_count + 0x100) - curr_count = get_8254_timer_count(); - - } while (prev_count >= curr_count); -} - -/* - * Default initialization for 8254 timers. If we use other timers like HPET, - * we override this later - */ -void (*wait_timer_tick)(void) __devinitdata = wait_8254_wraparound; - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. 
*/ - -#define APIC_DIVISOR 16 - -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot) +int __init APIC_init_uniprocessor (void) { - unsigned int lvtt_value, tmp_value, ver; - int cpu = smp_processor_id(); - - ver = GET_APIC_VERSION(apic_read(APIC_LVR)); - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!APIC_INTEGRATED(ver)) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (cpu_isset(cpu, timer_bcast_ipi)) - lvtt_value |= APIC_LVT_MASKED; + if (enable_local_apic < 0) + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - apic_write_around(APIC_LVTT, lvtt_value); + if (!smp_found_config && !cpu_has_apic) + return -1; /* - * Divide PICLK by 16 + * Complain if the BIOS pretends there is one. */ - tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); - - if (!oneshot) - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); -} - -static void lapic_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - apic_write_around(APIC_TMICT, delta); -} - -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - unsigned long flags; - - local_irq_save(flags); - if (CLOCK_EVT_PERIODIC) { - /* - * Wait for IRQ0's slice: - */ - wait_timer_tick(); + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return -1; } - __setup_APIC_LVTT(calibration_result, mode != CLOCK_EVT_PERIODIC); - local_irq_restore(flags); -} - -/* - * Setup the local APIC timer for this CPU. Copy the initilized values - * of the boot CPU and register the clock event in the framework. 
- */ -static void __devinit setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - register_local_clockevent(levt); -} - -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * TODO: Fix this rather than saying "Ugh" -tglx - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -static int __init calibrate_APIC_clock(void) -{ - unsigned long long t1 = 0, t2 = 0; - long tt1, tt2; - long result; - int i; - const int LOOPS = HZ/10; - - apic_printk(APIC_VERBOSE, "calibrating APIC timer ...\n"); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - */ - __setup_APIC_LVTT(1000000000, 0); - - /* - * The timer chip counts down to zero. Let's wait - * for a wraparound to start exact measurement: - * (the current tick might have been already half done) - */ - - wait_timer_tick(); - - /* - * We wrapped around just now. Let's start: - */ - if (cpu_has_tsc) - rdtscll(t1); - tt1 = apic_read(APIC_TMCCT); - - /* - * Let's wait LOOPS wraprounds: - */ - for (i = 0; i < LOOPS; i++) - wait_timer_tick(); - - tt2 = apic_read(APIC_TMCCT); - if (cpu_has_tsc) - rdtscll(t2); - - /* - * The APIC bus clock counter is 32 bits only, it - * might have overflown, but note that we use signed - * longs, thus no extra care needed. 
- * - * underflown to be exact, as the timer counts down ;) - */ - - result = (tt1-tt2)*APIC_DIVISOR/LOOPS; - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(tt1-tt2, TICK_NSEC * LOOPS, 32); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - apic_printk(APIC_VERBOSE, "..... tt1-tt2 %ld\n", tt1 - tt2); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); - apic_printk(APIC_VERBOSE, "..... calibration result: %ld\n", result); - - if (cpu_has_tsc) - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - ((long)(t2-t1)/LOOPS)/(1000000/HZ), - ((long)(t2-t1)/LOOPS)%(1000000/HZ)); - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%ld.%04ld MHz.\n", - result/(1000000/HZ), - result%(1000000/HZ)); - - return result; -} - -void __init setup_boot_APIC_clock(void) -{ - unsigned long flags; - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); - using_apic_timer = 1; + verify_local_APIC(); - local_irq_save(flags); + connect_bsp_APIC(); - calibration_result = calibrate_APIC_clock(); /* - * Now set up the timer for real. + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. */ - setup_APIC_timer(); - - local_irq_restore(flags); -} - -void __devinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(); -} - -void disable_APIC_timer(void) -{ - if (using_apic_timer) { - unsigned long v; - - v = apic_read(APIC_LVTT); - /* - * When an illegal vector value (0-15) is written to an LVT - * entry and delivery mode is Fixed, the APIC may signal an - * illegal vector error, with out regard to whether the mask - * bit is set or whether an interrupt is actually seen on input. 
- * - * Boot sequence might call this function when the LVTT has - * '0' vector value. So make sure vector field is set to - * valid value. - */ - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); - } -} - -void enable_APIC_timer(void) -{ - int cpu = smp_processor_id(); - - if (using_apic_timer && - !cpu_isset(cpu, timer_bcast_ipi)) { - unsigned long v; - - v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); - } -} - -void switch_APIC_timer_to_ipi(void *cpumask) -{ - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - !cpu_isset(cpu, timer_bcast_ipi)) { - disable_APIC_timer(); - cpu_set(cpu, timer_bcast_ipi); -#ifdef CONFIG_HIGH_RES_TIMERS - /* - * C3 stops the local apic timer. We can not make high - * resolution timers and dynamic ticks work with one global - * timer. Disable the NEXTEVT capability, so high resolution / - * dyntick mode gets disabled too. - * - * There is a solution for this problem, but this is beyond the - * scope of this initial patchset: - * - * When the local apic timer is unusable in C3, then we can - * utilize the PIT to provide a global wakeup, which can be - * directed to the CPU which has the earliest wakeup - * point. Once the CPU is up again, the local apic is resumed - * and can be used for the per cpu clock events again. It's not - * hard to provide the infrastructure, but I need more insight - * into the ACPI code to get it right. - * - * Disable the highres/dyntick feature in this case for now, - * until somebody beats the ACPI clue into me. 
:) - * - * tglx - */ - printk("Disabling NO_HZ and high resolution timers " - "due to timer broadcasting (C3 stops local apic)\n"); - for_each_possible_cpu(cpu) - per_cpu(lapic_events, cpu).capabilities &= - ~CLOCK_CAP_NEXTEVT; +#ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); #endif - } -} -EXPORT_SYMBOL(switch_APIC_timer_to_ipi); - -void switch_ipi_to_APIC_timer(void *cpumask) -{ - cpumask_t mask = *(cpumask_t *)cpumask; - int cpu = smp_processor_id(); - - if (cpu_isset(cpu, mask) && - cpu_isset(cpu, timer_bcast_ipi)) { - cpu_clear(cpu, timer_bcast_ipi); - enable_APIC_timer(); - } -} -EXPORT_SYMBOL(switch_ipi_to_APIC_timer); - -#undef APIC_DIVISOR + phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); -/* - * Local timer interrupt handler. It does both profiling and - * process statistics/rescheduling. - * - * We do profiling in every local tick, statistics/rescheduling - * happen only every 'profiling multiplier' ticks. The default - * multiplier is 1 and it can be changed by writing the new multiplier - * value into /proc/profile. - */ + setup_local_APIC(); -inline void smp_local_timer_interrupt(void) -{ - profile_tick(CPU_PROFILING); -#ifdef CONFIG_SMP - update_process_times(user_mode_vm(get_irq_regs())); +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config) + if (!skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); #endif + setup_boot_APIC_clock(); - /* - * We take the 'long' return path, and there every subsystem - * grabs the apropriate locks (kernel lock/ irq lock). - * - * we might want to decouple profiling from the 'long path', - * and do the profiling totally in assembly. - * - * Currently this isn't too much of an issue (performance wise), - * we can take more than 100K local irqs per second on a 100 MHz P5. - */ + return 0; } /* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. 
[in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] + * APIC command line parameters */ - -fastcall void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ - irq_enter(); - evt->event_handler(regs); - irq_exit(); - set_irq_regs(old_regs); -} - -#ifndef CONFIG_SMP -static void up_apic_timer_interrupt_call(void) +static int __init parse_lapic(char *arg) { - int cpu = smp_processor_id(); - - /* - * the NMI deadlock-detector uses this. - */ - per_cpu(irq_stat, cpu).apic_timer_irqs++; - - smp_local_timer_interrupt(); + enable_local_apic = 1; + return 0; } -#endif - -void smp_send_timer_broadcast_ipi(void) -{ - cpumask_t mask; - - cpus_and(mask, cpu_online_map, timer_bcast_ipi); - if (!cpus_empty(mask)) { -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#else - /* - * We can directly call the apic timer interrupt handler - * in UP case. 
Minus all irq related functions - */ - up_apic_timer_interrupt_call(); -#endif - } +early_param("lapic", parse_lapic); + +static int __init parse_nolapic(char *arg) +{ + enable_local_apic = -1; + clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); + return 0; } +early_param("nolapic", parse_nolapic); -int setup_profiling_timer(unsigned int multiplier) +static int __init apic_set_verbosity(char *str) { - return -EINVAL; + if (strcmp("debug", str) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", str) == 0) + apic_verbosity = APIC_VERBOSE; + return 1; } +__setup("apic=", apic_set_verbosity); + + +/* + * Local APIC interrupts + */ + /* * This interrupt should _never_ happen with our APIC/SMP architecture */ @@ -1398,15 +1349,14 @@ fastcall void smp_spurious_interrupt(str ack_APIC_irq(); /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", - smp_processor_id()); + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); irq_exit(); } /* * This interrupt should never happen with our APIC/SMP architecture */ - fastcall void smp_error_interrupt(struct pt_regs *regs) { unsigned long v, v1; @@ -1430,69 +1380,247 @@ fastcall void smp_error_interrupt(struct 7: Illegal register address */ printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } /* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. 
+ * Initialize APIC interrupts */ -int __init APIC_init_uniprocessor (void) +void __init apic_intr_init(void) { - if (enable_local_apic < 0) - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + /* self generated IPI for local APIC timer */ + set_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - if (!smp_found_config && !cpu_has_apic) - return -1; + /* IPI vectors for APIC spurious and error interrupts */ + set_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + set_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); - return -1; + /* thermal monitor LVT interrupt */ +#ifdef CONFIG_X86_MCE_P4THERMAL + set_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +void __init connect_bsp_APIC(void) +{ + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); } + enable_apic_mode(); +} - verify_local_APIC(); +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). 
Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + } else { + /* Go back to Virtual Wire compatibility mode */ + unsigned long value; - connect_bsp_APIC(); + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write_around(APIC_SPIV, value); - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -#ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); -#endif - phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED ); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write_around(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + } - setup_local_APIC(); + /* + * For LVT1 make it edge triggered, active high, nmi and + * enabled + */ + value = apic_read(APIC_LVT1); + value &= ~( + APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write_around(APIC_LVT1, value); + } +} -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); 
-#endif - setup_boot_APIC_clock(); +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + if (!apic_pm_state.active) + return 0; + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); return 0; } -static int __init parse_lapic(char *arg) +static int lapic_resume(struct sys_device *dev) { - lapic_enable(); + unsigned int l, h; + unsigned long flags; + + if (!apic_pm_state.active) + return 0; + + local_irq_save(flags); + + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! 
+ */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + local_irq_restore(flags); return 0; } -early_param("lapic", parse_lapic); -static int __init parse_nolapic(char *arg) +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct sysdev_class lapic_sysclass = { + set_kset_name("lapic"), + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __devinit apic_pm_activate(void) { - lapic_disable(); - return 0; + apic_pm_state.active = 1; } -early_param("nolapic", parse_nolapic); +static int __init init_lapic_sysfs(void) +{ + int error; + + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? 
*/ + + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ diff -puN arch/i386/kernel/i8253.c~lapic-horror arch/i386/kernel/i8253.c --- a/arch/i386/kernel/i8253.c~lapic-horror +++ a/arch/i386/kernel/i8253.c @@ -85,10 +85,7 @@ static void pit_next_event(unsigned long struct clock_event_device pit_clockevent = { .name = "pit", .capabilities = CLOCK_CAP_TICK | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE -#ifndef CONFIG_SMP - | CLOCK_CAP_NEXTEVT -#endif - , + | CLOCK_CAP_NEXTEVT, .set_mode = init_pit_timer, .set_next_event = pit_next_event, .shift = 32, diff -puN arch/i386/kernel/io_apic.c~lapic-horror arch/i386/kernel/io_apic.c --- a/arch/i386/kernel/io_apic.c~lapic-horror +++ a/arch/i386/kernel/io_apic.c @@ -1533,7 +1533,7 @@ void /*__init*/ print_local_APIC(void * v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); diff -puN arch/i386/kernel/irq.c~lapic-horror arch/i386/kernel/irq.c --- a/arch/i386/kernel/irq.c~lapic-horror +++ a/arch/i386/kernel/irq.c @@ -10,7 +10,6 @@ * io_apic.c.) */ -#include <asm/uaccess.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/interrupt.h> @@ -19,19 +18,34 @@ #include <linux/cpu.h> #include <linux/delay.h> +#include <asm/apic.h> +#include <asm/uaccess.h> + DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; EXPORT_PER_CPU_SYMBOL(irq_stat); -#ifndef CONFIG_X86_LOCAL_APIC /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. 
*/ void ack_bad_irq(unsigned int irq) { - printk("unexpected IRQ trap at vector %02x\n", irq); -} + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); #endif +} #ifdef CONFIG_4KSTACKS /* diff -puN arch/i386/kernel/smpboot.c~lapic-horror arch/i386/kernel/smpboot.c --- a/arch/i386/kernel/smpboot.c~lapic-horror +++ a/arch/i386/kernel/smpboot.c @@ -433,9 +433,7 @@ static void __devinit smp_callin(void) /* * Save our processor parameters */ - smp_store_cpu_info(cpuid); - - disable_APIC_timer(); + smp_store_cpu_info(cpuid); /* * Allow the master to continue. @@ -552,7 +550,6 @@ static void __devinit start_secondary(vo enable_NMI_through_LVT0(NULL); enable_8259A_irq(0); } - enable_APIC_timer(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. @@ -739,7 +736,7 @@ wakeup_secondary_cpu(int logical_apicid, /* * Due to the Pentium erratum 3AP. 
*/ - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) { apic_read_around(APIC_SPIV); apic_write(APIC_ESR, 0); @@ -829,7 +826,7 @@ wakeup_secondary_cpu(int phys_apicid, un */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = get_maxlvt(); + maxlvt = lapic_get_maxlvt(); for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n",j); diff -puN arch/i386/kernel/time.c~lapic-horror arch/i386/kernel/time.c --- a/arch/i386/kernel/time.c~lapic-horror +++ a/arch/i386/kernel/time.c @@ -195,11 +195,6 @@ irqreturn_t timer_interrupt(int irq, voi outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ } -#ifdef CONFIG_X86_LOCAL_APIC - if (using_apic_timer) - smp_send_timer_broadcast_ipi(); -#endif - return IRQ_HANDLED; } diff -puN arch/i386/kernel/time_hpet.c~lapic-horror arch/i386/kernel/time_hpet.c --- a/arch/i386/kernel/time_hpet.c~lapic-horror +++ a/arch/i386/kernel/time_hpet.c @@ -43,23 +43,6 @@ static void hpet_writel(unsigned long d, writel(d, hpet_virt_address + a); } -#ifdef CONFIG_X86_LOCAL_APIC -/* - * HPET counters dont wrap around on every tick. They just change the - * comparator value and continue. Next tick can be caught by checking - * for a change in the comparator value. Used in apic.c. 
- */ -static void __devinit wait_hpet_tick(void) -{ - unsigned int start_cmp_val, end_cmp_val; - - start_cmp_val = hpet_readl(HPET_T0_CMP); - do { - end_cmp_val = hpet_readl(HPET_T0_CMP); - } while (start_cmp_val == end_cmp_val); -} -#endif - static int hpet_timer_stop_set_go(unsigned long tick) { unsigned int cfg; @@ -213,11 +196,6 @@ int __init hpet_enable(void) hpet_alloc(&hd); } #endif - -#ifdef CONFIG_X86_LOCAL_APIC - if (hpet_use_timer) - wait_timer_tick = wait_hpet_tick; -#endif return 0; } diff -puN drivers/acpi/processor_idle.c~lapic-horror drivers/acpi/processor_idle.c --- a/drivers/acpi/processor_idle.c~lapic-horror +++ a/drivers/acpi/processor_idle.c @@ -236,6 +236,165 @@ static void acpi_cstate_enter(struct acp } } +#ifdef ARCH_APICTIMER_STOPS_ON_C3 + +/* + * Some BIOS implementations switch to C3 in the published C2 state. This seems + * to be a common problem on AMD boxen. Anyway we do not trust the BIOS at all. + */ +static void acpi_timer_check_state(int state, struct acpi_processor *pr) +{ + struct acpi_processor_power *pwr = &pr->power; + + /* + * Check if one of the previous states already marked the lapic + * unstable + */ + if (pwr->timer_broadcast_on_state < state) + return; + +#ifdef CONFIG_X86_64 + /* Keep this until we have converted x86_64 */ + if(state == ACPI_STATE_C3 || + boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + pr->power.timer_broadcast_on_state = state; + return; + } +#endif + /* + * Did we detect this state as unstable earlier ?
+ */ + if (pwr->timer_state_unstable == state) + pr->power.timer_broadcast_on_state = state; +} + +static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) +{ + cpumask_t mask = cpumask_of_cpu(pr->id); + + if (pr->power.timer_broadcast_on_state < INT_MAX) + on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1); + else + on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1); +} + +/* Power(C) State timer broadcast control */ +static void acpi_state_timer_broadcast(struct acpi_processor *pr, + struct acpi_processor_cx *cx, + int broadcast) +{ + int state = cx - pr->power.states; + + if (state >= pr->power.timer_broadcast_on_state) { + lapic_timer_idle_broadcast(broadcast); + return; + } + + /* + * FIXME: Initialize this when the data structure is created ! + */ + if (!pr->power.timer_state_unstable) + pr->power.timer_state_unstable = INT_MAX; + + /* + * On cstate entry we verify once, if the lapic is stable + */ + if (broadcast && pr->power.timer_state_unstable > state && + pr->power.timer_state_verified < state) + lapic_timer_idle_verify(0); +} + +/* C-State timer verification */ +static void acpi_state_timer_verify(struct acpi_processor *pr, + struct acpi_processor_cx *cx, + uint32_t ticks) +{ + struct acpi_processor_power *pwr = &pr->power; + int state = cx - pr->power.states; + + /* + * On cstate exit we verify, if the lapic is stable + */ + if (pwr->timer_state_unstable > state && + pwr->timer_state_verified < state) { + int res = lapic_timer_idle_verify(ticks); + + switch (res) { + /* No decision made (time too short, multiple wraps) */ + case 0: break; + + /* Verification result: lapic timer unstable */ + case -1: + if (cx->timer_verify-- == -50) { + pwr->timer_state_unstable = state; + printk(KERN_INFO "ACPI: lapic stops on CPU " + "%d: state %d type %d idx %d\n", + smp_processor_id(), state, cx->type, + cx->index); + if (cx->type != ACPI_STATE_C3) + printk(KERN_WARNING "ACPI: lapic " + "stops in C%d state type\n", + cx->type); + if (state < 
pwr->timer_broadcast_on_state) { + pwr->timer_broadcast_on_state = state; + acpi_propagate_timer_broadcast(pr); + } + } + break; + /* Verification result: lapic timer stable */ + case 1: + if (cx->timer_verify++ == 50) { + pr->power.timer_state_verified = state; + printk(KERN_INFO "ACPI: lapic OK on CPU %d: " + "state %d type %d idx %d\n", + smp_processor_id(), state, cx->type, + cx->index); + } + break; + } + } +} +/* + * Block the state promotion long enough to verify the timer + */ +static int acpi_state_timer_verify_done(struct acpi_processor *pr, + struct acpi_processor_cx *cx) +{ + struct acpi_processor_power *pwr = &pr->power; + int state = cx - pr->power.states; + + /* + * Allow promotion, when either: + * + * - the state is C1 + * - the current state is >= the known start of unstable states + * - the current state is <= the known verified stable states + */ + return (cx->type == ACPI_STATE_C1 || + (pwr->timer_state_unstable <= state || + pwr->timer_state_verified >= state)); +} + +#else +static void acpi_timer_check_state(int state, struct acpi_processor *pr) { } +static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) { } +static void acpi_state_timer_broadcast(struct acpi_processor *pr, + struct acpi_processor_cx *cx, + int broadcast) +{ +} +static void acpi_state_timer_verify(struct acpi_processor *pr, + struct acpi_processor_cx *cx, + uint32_t ticks) +{ +} +static int acpi_state_timer_verify_done(struct acpi_processor *pr, + struct acpi_processor_cx *cx) +{ + return 1; +} +#endif + static void acpi_processor_idle(void) { struct acpi_processor *pr = NULL; @@ -378,9 +537,11 @@ static void acpi_processor_idle(void) /* Get start time (ticks) */ t1 = inl(acpi_fadt.xpm_tmr_blk.address); /* Invoke C2 */ + acpi_state_timer_broadcast(pr, cx, 1); acpi_cstate_enter(cx); /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); + acpi_state_timer_verify(pr, cx, ticks_elapsed(t1, t2)); #ifdef CONFIG_GENERIC_TIME /* TSC halts in C2, so notify 
users */ @@ -392,6 +553,7 @@ static void acpi_processor_idle(void) /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; + acpi_state_timer_broadcast(pr, cx, 0); break; case ACPI_STATE_C3: @@ -414,6 +576,7 @@ static void acpi_processor_idle(void) /* Get start time (ticks) */ t1 = inl(acpi_fadt.xpm_tmr_blk.address); /* Invoke C3 */ + acpi_state_timer_broadcast(pr, cx, 1); acpi_cstate_enter(cx); /* Get end time (ticks) */ t2 = inl(acpi_fadt.xpm_tmr_blk.address); @@ -424,6 +587,7 @@ static void acpi_processor_idle(void) ACPI_MTX_DO_NOT_LOCK); } + acpi_state_timer_verify(pr, cx, ticks_elapsed(t1, t2)); #ifdef CONFIG_GENERIC_TIME /* TSC halts in C3, so notify users */ mark_tsc_unstable(); @@ -434,6 +598,7 @@ static void acpi_processor_idle(void) /* Compute time (ticks) that we were actually asleep */ sleep_ticks = ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; + acpi_state_timer_broadcast(pr, cx, 0); break; default: @@ -469,7 +634,8 @@ static void acpi_processor_idle(void) cx->promotion.state->latency <= system_latency_constraint()) { cx->promotion.count++; cx->demotion.count = 0; - if (cx->promotion.count >= + if (acpi_state_timer_verify_done(pr, cx) && + cx->promotion.count >= cx->promotion.threshold.count) { if (pr->flags.bm_check) { if (! 
@@ -902,11 +1068,7 @@ static int acpi_processor_power_verify(s unsigned int i; unsigned int working = 0; -#ifdef ARCH_APICTIMER_STOPS_ON_C3 - int timer_broadcast = 0; - cpumask_t mask = cpumask_of_cpu(pr->id); - on_each_cpu(switch_ipi_to_APIC_timer, &mask, 1, 1); -#endif + pr->power.timer_broadcast_on_state = INT_MAX; for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) { struct acpi_processor_cx *cx = &pr->power.states[i]; @@ -918,21 +1080,14 @@ static int acpi_processor_power_verify(s case ACPI_STATE_C2: acpi_processor_power_verify_c2(cx); -#ifdef ARCH_APICTIMER_STOPS_ON_C3 - /* Some AMD systems fake C3 as C2, but still - have timer troubles */ - if (cx->valid && - boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - timer_broadcast++; -#endif + if (cx->valid) + acpi_timer_check_state(i, pr); break; case ACPI_STATE_C3: acpi_processor_power_verify_c3(pr, cx); -#ifdef ARCH_APICTIMER_STOPS_ON_C3 if (cx->valid) - timer_broadcast++; -#endif + acpi_timer_check_state(i, pr); break; } @@ -940,10 +1095,7 @@ static int acpi_processor_power_verify(s working++; } -#ifdef ARCH_APICTIMER_STOPS_ON_C3 - if (timer_broadcast) - on_each_cpu(switch_APIC_timer_to_ipi, &mask, 1, 1); -#endif + acpi_propagate_timer_broadcast(pr); return (working); } diff -puN drivers/clocksource/acpi_pm.c~lapic-horror drivers/clocksource/acpi_pm.c --- a/drivers/clocksource/acpi_pm.c~lapic-horror +++ a/drivers/clocksource/acpi_pm.c @@ -16,15 +16,13 @@ * This file is licensed under the GPL v2. */ +#include <linux/acpi_pmtmr.h> #include <linux/clocksource.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/pci.h> #include <asm/io.h> -/* Number of PMTMR ticks expected during calibration run */ -#define PMTMR_TICKS_PER_SEC 3579545 - /* * The I/O port the PMTMR resides at. 
* The location is detected during setup_arch(), @@ -32,15 +30,13 @@ */ u32 pmtmr_ioport __read_mostly; -#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) /* limit it to 24 bits */ - static inline u32 read_pmtmr(void) { /* mask the output to 24 bits */ return inl(pmtmr_ioport) & ACPI_PM_MASK; } -static cycle_t acpi_pm_read_verified(void) +u32 acpi_pm_read_verified(void) { u32 v1 = 0, v2 = 0, v3 = 0; @@ -57,7 +53,12 @@ static cycle_t acpi_pm_read_verified(voi } while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) || (v3 > v1 && v3 < v2))); - return (cycle_t)v2; + return v2; +} + +static cycle_t acpi_pm_read_slow(void) +{ + return (cycle_t)acpi_pm_read_verified(); } static cycle_t acpi_pm_read(void) @@ -87,7 +88,7 @@ __setup("acpi_pm_good", acpi_pm_good_set static inline void acpi_pm_need_workaround(void) { - clocksource_acpi_pm.read = acpi_pm_read_verified; + clocksource_acpi_pm.read = acpi_pm_read_slow; clocksource_acpi_pm.rating = 110; } diff -puN include/acpi/processor.h~lapic-horror include/acpi/processor.h --- a/include/acpi/processor.h~lapic-horror +++ a/include/acpi/processor.h @@ -67,6 +67,7 @@ struct acpi_processor_cx { u32 latency_ticks; u32 power; u32 usage; + s32 timer_verify; u64 time; struct acpi_processor_cx_policy promotion; struct acpi_processor_cx_policy demotion; @@ -79,6 +80,9 @@ struct acpi_processor_power { u32 bm_activity; int count; struct acpi_processor_cx states[ACPI_PROCESSOR_MAX_POWER]; + int timer_broadcast_on_state; + int timer_state_verified; + int timer_state_unstable; }; /* Performance Management */ diff -puN include/asm-i386/apic.h~lapic-horror include/asm-i386/apic.h --- a/include/asm-i386/apic.h~lapic-horror +++ a/include/asm-i386/apic.h @@ -84,9 +84,7 @@ static inline void ack_APIC_irq(void) apic_write_around(APIC_EOI, 0); } -extern void (*wait_timer_tick)(void); - -extern int get_maxlvt(void); +extern int lapic_get_maxlvt(void); extern void clear_local_APIC(void); extern void connect_bsp_APIC (void); extern void 
disconnect_bsp_APIC (int virt_wire_setup); @@ -102,12 +100,10 @@ extern void smp_local_timer_interrupt (v extern void setup_boot_APIC_clock (void); extern void setup_secondary_APIC_clock (void); extern int APIC_init_uniprocessor (void); -extern void disable_APIC_timer(void); -extern void enable_APIC_timer(void); - +extern void lapic_timer_idle_broadcast(int broadcast); +extern int lapic_timer_idle_verify(unsigned long ticks); extern void enable_NMI_through_LVT0 (void * dummy); -void smp_send_timer_broadcast_ipi(void); void switch_APIC_timer_to_ipi(void *cpumask); void switch_ipi_to_APIC_timer(void *cpumask); #define ARCH_APICTIMER_STOPS_ON_C3 1 diff -puN include/asm-i386/mpspec.h~lapic-horror include/asm-i386/mpspec.h --- a/include/asm-i386/mpspec.h~lapic-horror +++ a/include/asm-i386/mpspec.h @@ -23,7 +23,6 @@ extern struct mpc_config_intsrc mp_irqs extern int mpc_default_type; extern unsigned long mp_lapic_addr; extern int pic_mode; -extern int using_apic_timer; #ifdef CONFIG_ACPI extern void mp_register_lapic (u8 id, u8 enabled); diff -puN include/asm-x86_64/apic.h~lapic-horror include/asm-x86_64/apic.h --- a/include/asm-x86_64/apic.h~lapic-horror +++ a/include/asm-x86_64/apic.h @@ -87,6 +87,8 @@ extern void clustered_apic_check(void); extern void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, unsigned char msg_type, unsigned char mask); +static inline void lapic_timer_idle_broadcast(int broadcast) { } +static inline int lapic_timer_idle_verify(unsigned long ticks) { } #define K8_APIC_EXT_LVT_BASE 0x500 #define K8_APIC_EXT_INT_MSG_FIX 0x0 diff -puN /dev/null include/linux/acpi_pmtmr.h --- /dev/null +++ a/include/linux/acpi_pmtmr.h @@ -0,0 +1,37 @@ +#ifndef _ACPI_PMTMR_H_ +#define _ACPI_PMTMR_H_ + +#include <linux/clocksource.h> + +#ifdef CONFIG_X86_PM_TIMER +/* Number of PMTMR ticks expected during calibration run */ +#define PMTMR_TICKS_PER_SEC 3579545 + +/* limit it to 24 bits */ +#define ACPI_PM_MASK CLOCKSOURCE_MASK(24) + +/* Overrun 
value */ +#define ACPI_PM_OVRRUN 1<<24 + +extern u32 acpi_pm_read_verified(void); +extern u32 pmtmr_ioport; + +static inline u32 acpi_pm_read_early(void) +{ + if (!pmtmr_ioport) + return 0; + /* mask the output to 24 bits */ + return acpi_pm_read_verified(); +} + +#else + +static inline u32 acpi_pm_read_early(void) +{ + return 0; +} + +#endif + +#endif + diff -puN include/linux/clockchips.h~lapic-horror include/linux/clockchips.h --- a/include/linux/clockchips.h~lapic-horror +++ a/include/linux/clockchips.h @@ -114,9 +114,26 @@ extern int clockevents_set_next_event(kt extern int clockevents_next_event_available(void); extern void clockevents_resume_events(void); +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern void clockevents_set_broadcast(struct clock_event_device *evt, + int broadcast); +extern void clockevents_set_global_broadcast(struct clock_event_device *evt, + int broadcast); +extern int clockevents_register_broadcast(void (*fun)(cpumask_t *mask)); #else +static inline void clockevents_set_broadcast(struct clock_event_device *, int) +{ +} +#endif + +#else + # define clockevents_init() do { } while(0) # define clockevents_resume_events() do { } while(0) +static inline void clockevents_set_broadcast(struct clock_event_device *, int) +{ +} + #endif #endif diff -puN kernel/hrtimer.c~lapic-horror kernel/hrtimer.c --- a/kernel/hrtimer.c~lapic-horror +++ a/kernel/hrtimer.c @@ -314,7 +314,7 @@ static int __init setup_hrtimer_hres(cha return 1; } -__setup("highres", setup_hrtimer_hres); +__setup("highres=", setup_hrtimer_hres); /* * Is the high resolution mode active ? 
@@ -966,6 +966,12 @@ static inline void hrtimer_resume_jiffy_ #endif /* CONFIG_HIGH_RES_TIMERS */ +static inline int hrtimer_is_queued(struct hrtimer *timer) +{ + return timer->state != HRTIMER_STATE_INACTIVE && + timer->state != HRTIMER_STATE_CALLBACK; +} + #ifdef CONFIG_TIMER_STATS void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr) { @@ -1135,7 +1141,7 @@ static void __remove_hrtimer(struct hrti static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) { - if (hrtimer_active(timer)) { + if (hrtimer_is_queued(timer)) { int reprogram; /* diff -puN kernel/time/Kconfig~lapic-horror kernel/time/Kconfig --- a/kernel/time/Kconfig~lapic-horror +++ a/kernel/time/Kconfig @@ -3,7 +3,7 @@ # config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on GENERIC_TIME + depends on GENERIC_TIME && GENERIC_CLOCKEVENTS help This option enables high resolution timer support. If your hardware is not capable then this option only increases @@ -11,7 +11,7 @@ config HIGH_RES_TIMERS config NO_HZ bool "Tickless System (Dynamic Ticks)" - depends on GENERIC_TIME && HIGH_RES_TIMERS + depends on HIGH_RES_TIMERS help This option enables a tickless system: timer interrupts will only trigger on an as-needed basis both when the system is diff -puN kernel/time/clockevents.c~lapic-horror kernel/time/clockevents.c --- a/kernel/time/clockevents.c~lapic-horror +++ a/kernel/time/clockevents.c @@ -49,6 +49,7 @@ struct local_events { int installed; struct event_descr events[MAX_CLOCK_EVENTS]; struct clock_event_device *nextevt; + ktime_t expires_next; }; /* Variables related to the global event device */ @@ -66,6 +67,12 @@ static DEFINE_SPINLOCK(events_lock); /* Variables related to the per cpu local event devices */ static DEFINE_PER_CPU(struct local_events, local_eventdevices); +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +static void clockevents_check_broadcast(struct event_descr *descr); +#else +static inline void 
clockevents_check_broadcast(struct event_descr *descr) { } +#endif + /* * Math helper. Convert a latch value (device ticks) to nanoseconds */ @@ -254,14 +261,14 @@ int __init register_global_clockevent(st return -EINVAL; } -#ifdef CONFIG_SMP /* * On UP systems the global clock event device can be used as the next * event device. On SMP this is disabled because the next event device * must be per CPU. */ - evt->capabilities &= ~CLOCK_CAP_NEXTEVT; -#endif + if (num_possible_cpus() > 1) + evt->capabilities &= ~CLOCK_CAP_NEXTEVT; + /* Mask out high resolution capabilities for now */ global_eventdevice.event = evt; @@ -301,6 +308,7 @@ static void recalc_active_event(struct e descr->event->name); } descr->real_caps = caps; + clockevents_check_broadcast(descr); } /* @@ -464,6 +472,24 @@ int clockevents_init_next_event(void) return ret; } +/* + * Reprogram the clock event device. Internal helper function + */ +static void do_clockevents_set_next_event(struct clock_event_device *nextevt, + int64_t delta) +{ + unsigned long long clc; + + if (delta > nextevt->max_delta_ns) + delta = nextevt->max_delta_ns; + if (delta < nextevt->min_delta_ns) + delta = nextevt->min_delta_ns; + + clc = delta * nextevt->mult; + clc >>= nextevt->shift; + nextevt->set_next_event((unsigned long)clc, nextevt); +} + /** * clockevents_set_next_event - Reprogram the clock event device. * @expires: absolute expiry time (monotonic clock) @@ -472,29 +498,186 @@ int clockevents_init_next_event(void) * * Returns 0 on success, -ETIME when the event is in the past and force is not * set. + * Called with interrupts disabled. 
*/ int clockevents_set_next_event(ktime_t expires, int force) { struct local_events *devices = &__get_cpu_var(local_eventdevices); int64_t delta = ktime_to_ns(ktime_sub(expires, ktime_get())); struct clock_event_device *nextevt = devices->nextevt; - unsigned long long clc; - if (delta <= 0 && !force) + if (delta <= 0 && !force) { + devices->expires_next.tv64 = KTIME_MAX; return -ETIME; + } - if (delta > nextevt->max_delta_ns) - delta = nextevt->max_delta_ns; - if (delta < nextevt->min_delta_ns) - delta = nextevt->min_delta_ns; + devices->expires_next = expires; - clc = delta * nextevt->mult; - clc >>= nextevt->shift; - nextevt->set_next_event((unsigned long)clc, devices->nextevt); + do_clockevents_set_next_event(nextevt, delta); return 0; } +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + +static cpumask_t global_event_broadcast; +static cpumask_t local_event_broadcast; +static void (*broadcast_function)(cpumask_t *mask); +static void (*global_event_handler)(struct pt_regs *regs); + +/** + * clockevents_set_broadcast - switch next event device from/to broadcast mode + * + * Called, when the PM code enters a state, where the next event device is + * switched off. + * + * Called with interrupts disabled ! 
+ */ +void clockevents_set_broadcast(struct clock_event_device *evt, int broadcast) +{ + struct local_events *devices = &__get_cpu_var(local_eventdevices); + struct clock_event_device *glblevt = global_eventdevice.event; + int cpu = smp_processor_id(); + ktime_t expires = { .tv64 = KTIME_MAX }; + int64_t delta; + unsigned long flags; + + if (devices->nextevt != evt) + return; + + spin_lock_irqsave(&events_lock, flags); + + if (broadcast) { + cpu_set(cpu, local_event_broadcast); + evt->set_mode(CLOCK_EVT_SHUTDOWN, evt); + } else { + cpu_clear(cpu, local_event_broadcast); + evt->set_mode(CLOCK_EVT_ONESHOT, evt); + } + + /* Reprogram the broadcast device */ + for (cpu = first_cpu(local_event_broadcast); cpu != NR_CPUS; + cpu = next_cpu(cpu, local_event_broadcast)) { + devices = &per_cpu(local_eventdevices, cpu); + if (devices->expires_next.tv64 < expires.tv64) + expires = devices->expires_next; + } + + if (expires.tv64 != KTIME_MAX) { + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); + do_clockevents_set_next_event(glblevt, delta); + } + + spin_unlock_irqrestore(&events_lock, flags); +} + +/** + * clockevents_set_global_broadcast - mark event device for global broadcast + * + * Switch an event device from / to global broadcasting. This is only relevant + * when the system has not switched to high resolution mode. 
+ */ +void clockevents_set_global_broadcast(struct clock_event_device *evt, + int broadcast) +{ + struct local_events *devices = &__get_cpu_var(local_eventdevices); + int cpu = smp_processor_id(); + unsigned long flags; + + spin_lock_irqsave(&events_lock, flags); + + if (broadcast) { + if (!cpu_isset(cpu, global_event_broadcast)) { + cpu_set(cpu, global_event_broadcast); + if (devices->nextevt != evt) + evt->set_mode(CLOCK_EVT_SHUTDOWN, evt); + } + } else { + if (cpu_isset(cpu, global_event_broadcast)) { + cpu_clear(cpu, global_event_broadcast); + if (devices->nextevt != evt) + evt->set_mode(CLOCK_EVT_PERIODIC, evt); + } + } + + spin_unlock_irqrestore(&events_lock, flags); +} + +/* + * Broadcast tick handler: run the saved global handler, then broadcast to the cpus in global_event_broadcast + */ +static void handle_tick_broadcast(struct pt_regs *regs) +{ + /* Call the original global tick handler */ + global_event_handler(regs); + broadcast_function(&global_event_broadcast); +} + +/* + * Broadcast next event handler: notify the cpus in local_event_broadcast whose expires_next has passed + */ +static void handle_nextevt_broadcast(struct pt_regs *regs) +{ + struct local_events *devices; + ktime_t now = ktime_get(); + cpumask_t mask = CPU_MASK_NONE; /* must start empty: only expired cpus are set below */ + int cpu; + + spin_lock(&events_lock); + /* Find all expired events */ + for (cpu = first_cpu(local_event_broadcast); cpu != NR_CPUS; + cpu = next_cpu(cpu, local_event_broadcast)) { + devices = &per_cpu(local_eventdevices, cpu); + if (devices->expires_next.tv64 <= now.tv64) + cpu_set(cpu, mask); + } + spin_unlock(&events_lock); + /* Wakeup the cpus which have an expired event */ + broadcast_function(&mask); +} + +/* + * Check, if the reconfigured event device is the global broadcast device. + * + * Called with interrupts disabled and events_lock held + */ +static void clockevents_check_broadcast(struct event_descr *descr) +{ + if (descr != &global_eventdevice) + return; + + /* The device was disabled. 
switch it to oneshot mode instead */ + if (!descr->real_caps) { + global_event_handler = NULL; + descr->event->set_mode(CLOCK_EVT_ONESHOT, descr->event); + descr->event->event_handler = handle_nextevt_broadcast; + } else { /* save the current handler and route events through handle_tick_broadcast */ + global_event_handler = descr->event->event_handler; + descr->event->event_handler = handle_tick_broadcast; + } + +} + +/* + * Install a broadcast function. Returns -EBUSY if one is already installed, 0 otherwise. NOTE(review): the -EBUSY check runs before events_lock is taken - confirm callers cannot race + */ +int clockevents_register_broadcast(void (*fun)(cpumask_t *mask)) +{ + unsigned long flags; + + if (broadcast_function) + return -EBUSY; + + spin_lock_irqsave(&events_lock, flags); + broadcast_function = fun; + clockevents_check_broadcast(&global_eventdevice); + spin_unlock_irqrestore(&events_lock, flags); + + return 0; +} + +#endif + /* * Resume the cpu local clock events */ _ Patches currently in -mm which might be from tglx@xxxxxxxxxxxxx are setup_irq-better-mismatch-debugging.patch printk-timed-ratelimit.patch schedule-removal-of-futex_fd.patch gtod-exponential-update_wall_time.patch gtod-persistent-clock-support-core.patch gtod-persistent-clock-support-i386.patch time-uninline-jiffiesh.patch time-uninline-jiffiesh-fix.patch time-fix-msecs_to_jiffies-bug.patch time-fix-timeout-overflow.patch cleanup-uninline-irq_enter-and-move-it-into-a-function.patch dynticks-extend-next_timer_interrupt-to-use-a-reference-jiffie.patch dynticks-extend-next_timer_interrupt-to-use-a-reference-jiffie-remove-incorrect-warning-in-kernel-timerc.patch hrtimers-namespace-and-enum-cleanup.patch hrtimers-clean-up-locking.patch hrtimers-state-tracking.patch hrtimers-clean-up-callback-tracking.patch hrtimers-move-and-add-documentation.patch clockevents-core.patch clockevents-drivers-for-i386.patch high-res-timers-core.patch gtod-mark-tsc-unusable-for-highres-timers.patch dynticks-core.patch dynticks-add-nohz-stats-to-proc-stat.patch dynticks-i386-arch-code.patch high-res-timers-dynticks-enable-i386-support.patch debugging-feature-timer-stats.patch lapic-horror.patch round_jiffies-infrastructure.patch 
round_jiffies-infrastructure-fix.patch - To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html