Hi Neil, (Copying Mohit, who ran these tests) TL;DR: ====== We enabled the traces like you'd suggested, but did not find anything useful. We're looking at the corresponding kernel code. Any other suggestions? Detailed description: =============== 1. All the logs under arch/x86 were enabled: [root]# cat /sys/kernel/debug/dynamic_debug/control |grep arch/x86 arch/x86/kernel/tboot.c:101 [tboot]tboot_probe p "tboot_size: 0x%x\012" arch/x86/kernel/tboot.c:100 [tboot]tboot_probe p "tboot_base: 0x%08x\012" arch/x86/kernel/tboot.c:99 [tboot]tboot_probe p "shutdown_entry: 0x%x\012" arch/x86/kernel/tboot.c:98 [tboot]tboot_probe p "log_addr: 0x%08x\012" arch/x86/kernel/tboot.c:97 [tboot]tboot_probe p "version: %d\012" arch/x86/kernel/cpu/common.c:1251 [common]cpu_init p "Initializing CPU#%d\012" arch/x86/kernel/cpu/perfctr-watchdog.c:249 [perfctr_watchdog]write_watchdog_counter p "setting %s to -0x%08Lx\012" arch/x86/kernel/cpu/perfctr-watchdog.c:260 [perfctr_watchdog]write_watchdog_counter32 p "setting %s to -0x%08Lx\012" arch/x86/kernel/acpi/boot.c:969 [boot]mp_config_acpi_legacy_irqs p "Bus #%d is ISA\012" arch/x86/kernel/smpboot.c:1238 [smpboot]native_smp_cpus_done p "Boot done.\012" arch/x86/kernel/smpboot.c:533 [smpboot]impress_friends p "Before bogocount - setting activated=1.\012" arch/x86/kernel/smpboot.c:523 [smpboot]impress_friends p "Before bogomips.\012" arch/x86/kernel/smpboot.c:975 [smpboot]native_cpu_up p "do_boot_cpu failed %d\012" arch/x86/kernel/smpboot.c:960 [smpboot]native_cpu_up p "do_boot_cpu %d Already started\012" arch/x86/kernel/smpboot.c:948 [smpboot]native_cpu_up p "++++++++++++++++++++=_---CPU UP %u\012" arch/x86/kernel/smpboot.c:898 [smpboot]do_boot_cpu p "CPU%d: has booted.\012" arch/x86/kernel/smpboot.c:877 [smpboot]do_boot_cpu p "After Callout %d.\012" arch/x86/kernel/smpboot.c:875 [smpboot]do_boot_cpu p "Before Callout %d.\012" arch/x86/kernel/smpboot.c:848 [smpboot]do_boot_cpu p "Setting warm reset code and vector.\012" arch/x86/kernel/smpboot.c:724 [smpboot]wakeup_secondary_cpu_via_init p "After Startup.\012" arch/x86/kernel/smpboot.c:710 [smpboot]wakeup_secondary_cpu_via_init p "Waiting for send to finish...\012" arch/x86/kernel/smpboot.c:708 [smpboot]wakeup_secondary_cpu_via_init p "Startup point 1.\012" arch/x86/kernel/smpboot.c:690 [smpboot]wakeup_secondary_cpu_via_init p "After apic_write.\012" arch/x86/kernel/smpboot.c:686 [smpboot]wakeup_secondary_cpu_via_init p "Sending STARTUP #%d.\012" arch/x86/kernel/smpboot.c:683 [smpboot]wakeup_secondary_cpu_via_init p "#startup loops: %d.\012" arch/x86/kernel/smpboot.c:655 [smpboot]wakeup_secondary_cpu_via_init p "Waiting for send to finish...\012" arch/x86/kernel/smpboot.c:649 [smpboot]wakeup_secondary_cpu_via_init p "Deasserting INIT.\012" arch/x86/kernel/smpboot.c:644 [smpboot]wakeup_secondary_cpu_via_init p "Waiting for send to finish...\012" arch/x86/kernel/smpboot.c:632 [smpboot]wakeup_secondary_cpu_via_init p "Asserting INIT.\012" arch/x86/kernel/smpboot.c:270 [smpboot]smp_callin p "Stack at about %p\012" arch/x86/kernel/smpboot.c:243 [smpboot]smp_callin p "CALLIN, before setup_local_APIC().\012" arch/x86/kernel/smpboot.c:208 [smpboot]smp_callin p "CPU#%d (phys ID: %d) waiting for CALLOUT\012" arch/x86/kernel/smpboot.c:604 [smpboot]wakeup_secondary_cpu_via_nmi p "NMI sent.\012" arch/x86/kernel/smpboot.c:591 [smpboot]wakeup_secondary_cpu_via_nmi p "Waiting for send to finish...\012" arch/x86/kernel/tsc_sync.c:151 [tsc_sync]check_tsc_sync_source p "TSC synchronization [CPU#%d -> CPU#%d]: passed\012" arch/x86/kernel/setup_percpu.c:120 [setup_percpu]pcpu_alloc_bootmem p "per cpu data for cpu%d %lu bytes on node%d at %016lx\012" arch/x86/kernel/setup_percpu.c:115 [setup_percpu]pcpu_alloc_bootmem p "per cpu data for cpu%d %lu bytes at %016lx\012" arch/x86/kernel/apic/apic.c:1525 [apic]enable_IR_x2apic p "dmar_table_init() failed with %d:\012" arch/x86/kernel/apic/apic.c:1499 [apic]enable_IR p "intr-remapping not supported\012" arch/x86/kernel/apic/io_apic.c:4021 [io_apic]io_apic_set_pci_routing p "Pin %d-%d already programmed\012" arch/x86/kernel/apic/io_apic.c:1622 [io_apic]setup_IO_APIC_irq_extra p "Pin %d-%d already programmed\012" arch/x86/mm/init_64.c:797 [init_64]set_kernel_text_ro p "Set kernel text: %lx - %lx for read only\012" arch/x86/mm/init_64.c:778 [init_64]set_kernel_text_rw p "Set kernel text: %lx - %lx for read write\012" arch/x86/mm/init_64.c:154 [init_64]spp_getpage p "spp_getpage %p\012" arch/x86/mm/init_64.c:218 [init_64]set_pte_vaddr p "set_pte_vaddr %lx to %lx\012" arch/x86/mm/numa.c:41 [numa]setup_node_to_cpumask_map p "Node to cpumask map for %d nodes\012" arch/x86/pci/i386.c:202 [i386]pcibios_allocate_resources p "disabling ROM %pR\012" arch/x86/pci/i386.c:184 [i386]pcibios_allocate_resources p "BAR %d: reserving %pr (d=%d, p=%d)\012" arch/x86/pci/fixup.c:27 [fixup]pci_fixup_i450nx p "i450NX PXB %d: %02x/%02x/%02x\012" arch/x86/pci/fixup.c:84 [fixup]pci_fixup_latency p "Setting max latency to 32\012" arch/x86/pci/irq.c:1036 [irq]pcibios_fixup_irqs p "ignoring bogus IRQ %d\012" arch/x86/pci/irq.c:983 [irq]pcibios_lookup_irq p "can't route interrupt\012" arch/x86/pci/irq.c:959 [irq]pcibios_lookup_irq p "PCI INT %c -> newirq %d" arch/x86/pci/irq.c:917 [irq]pcibios_lookup_irq p "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x" arch/x86/pci/irq.c:913 [irq]pcibios_lookup_irq p "PCI INT %c not routed\012" arch/x86/pci/irq.c:907 [irq]pcibios_lookup_irq p "PCI INT %c not found in routing table\012" arch/x86/pci/irq.c:892 [irq]pcibios_lookup_irq p "no interrupt pin\012" arch/x86/pci/early.c:55 [early]write_pci_config_16 p "%x writing to %x: %x\012" arch/x86/pci/early.c:48 [early]write_pci_config_byte p "%x writing to %x: %x\012" arch/x86/pci/early.c:41 [early]write_pci_config p "%x writing to %x: %x\012" arch/x86/pci/early.c:34 [early]read_pci_config_16 p "%x reading 2 from %x: %x\012" arch/x86/pci/early.c:25 [early]read_pci_config_byte p "%x reading 1 from %x: %x\012" arch/x86/pci/early.c:16 [early]read_pci_config p "%x reading 4 from %x: %x\012" arch/x86/kernel/microcode_core.c:433 [microcode]mc_sysdev_remove p "microcode: CPU%d removed\012" arch/x86/kernel/microcode_core.c:414 [microcode]mc_sysdev_add p "microcode: CPU%d added\012" arch/x86/kernel/microcode_core.c:387 [microcode]microcode_init_cpu p "microcode: CPU%d updated upon init\012" arch/x86/kernel/microcode_core.c:480 [microcode]mc_cpu_callback p "microcode: CPU%d removed\012" arch/x86/kernel/microcode_core.c:472 [microcode]mc_cpu_callback p "microcode: CPU%d added\012" arch/x86/kernel/microcode_core.c:367 [microcode]microcode_resume_cpu p "microcode: CPU%d updated upon resume\012" arch/x86/kernel/microcode_intel.c:438 [microcode]request_microcode_fw p "microcode: data file %s load failed\012" arch/x86/kernel/microcode_intel.c:416 [microcode]generic_load_microcode p "microcode: CPU%d found a matching microcode update with version 0x%x (current=0x%x)\012" arch/x86/kernel/microcode_amd.c:314 [microcode]generic_load_microcode p "microcode: CPU%d found a matching microcode update with version 0x%x (current=0x%x)\012" arch/x86/kvm/x86.c:895 [kvm]kvm_get_time_scale p "%s: base_khz %u => %u, shift %d, mul %u\012" arch/x86/kvm/x86.c:6349 [kvm]kvm_arch_vcpu_ioctl_set_sregs p "Set back pending irq %d\012" arch/x86/kvm/x86.c:5338 [kvm]__vcpu_run p "vcpu %d received sipi with vector # %x\012" arch/x86/kvm/x86.c:4684 [kvm]kvm_timer_init p "kvm: max_tsc_khz = %ld\012" arch/x86/kvm/x86.c:984 [kvm]kvm_write_tsc p "kvm: matched write on unstable tsc\012" arch/x86/kvm/x86.c:981 [kvm]kvm_write_tsc p "kvm: matched tsc offset for %llu\012" arch/x86/kvm/i8254.c:471 [kvm]pit_ioport_write p "pit: write addr is 0x%x, len is %d, val is 0x%x\012" arch/x86/kvm/i8254.c:384 [kvm]pit_load_count p "pit: load_count val is %d, channel is %d\012" arch/x86/kvm/i8254.c:359 [kvm]create_pit_timer p "pit: create pit timer, interval is %llu nsec\012" 2. printk level was set: [root]# cat /proc/sys/kernel/printk 7 4 1 7 3. When we tried using CPU#1 for certain interrupts, it was not getting set, and we did not get any (error/debug) logs in dmesg: [root]# cat /proc/irq/620/smp_affinity_list 0-71 [root]# echo 1 > /proc/irq/620/smp_affinity_list [root]# cat /proc/irq/620/smp_affinity_list 0-71 4. To make sure we were doing this right, we tried taking a CPU offline and bringing it back online and were able to see the log corresponding to the one we enabled using dynamic debug: kvm: disabling virtualization on CPU1 CPU 1 is now offline smpboot:++++++++++++++++++++=_---CPU UP 1 Booting Node 0 Processor 1 APIC 0x2 smpboot:Setting warm reset code and vector. smpboot:1. smpboot:2. smpboot:3. smpboot:Asserting INIT. smpboot:Waiting for send to finish... smpboot:Deasserting INIT. smpboot:Waiting for send to finish... smpboot:#startup loops: 2. smpboot:Sending STARTUP #1. smpboot:After apic_write. smpboot:Startup point 1. smpboot:Waiting for send to finish... smpboot:Sending STARTUP #2. smpboot:After apic_write. smpboot:Startup point 1. smpboot:Waiting for send to finish... smpboot:After Startup. smpboot:Before Callout 1. smpboot:After Callout 1. common:Initializing CPU#1 smpboot:CPU#1 (phys ID: 2) waiting for CALLOUT smpboot:CALLIN, before setup_local_APIC(). smpboot:Stack at about ffff8820535c1f44 smpboot:CPU1: has booted. kvm: enabling virtualization on CPU1 tsc_sync:TSC synchronization [CPU#20 -> CPU#1]: passed microcode: CPU1 sig=0x306f2, pf=0x1, revision=0x2a platform microcode: firmware: requesting intel-ucode/06-3f-02 microcode:microcode: data file intel-ucode/06-3f-02 load failed microcode:microcode: CPU1 added Here's the corresponding log: arch/x86/kernel/cpu/common.c:1251 [common]cpu_init p "Initializing CPU#%d\012" Regards, Mohsin On Thu, Nov 26, 2015 at 9:39 PM, Mohsin Zaidi <mohsinrzaidi at gmail.com> wrote: > Thanks, Neil. Let me see what I can find out. > Regards, > Mohsin > > > On Mon, Nov 23, 2015 at 8:55 AM, Neil Horman <nhorman at tuxdriver.com> wrote: >> On Fri, Nov 20, 2015 at 04:52:31PM -0500, Mohsin Zaidi wrote: >>> I stopped irqbalance and set the affinities of all the interface IRQs >>> to 18 and only 169 of them get set correctly to 18. The rest have the >>> same value they had before (in my test 19,55). >>> Regards, >>> Mohsin >>> >> Ok, thats starting to make more sense. That indicates the issue is occuring in >> the kernel, not irqbalance. Whats most likely happening is either the code that >> parses the cpu mask buffer has a bug that can't handle a mask that big (unlikley >> given this works on other simmilarly sized hardware), or something about the >> irq controllers irq_set_affinity method somehow can't handle the mask that >> you're providing. >> >> The next step would be to enable dynamic debug in the kernel for all files in >> arch/x86/kernel and below, and crank your console printk level up to DEBUG, to >> see if any errors are reported when setting affinities on some of those >> interrupts. >> >> Neil >> >>> >>> On Fri, Nov 20, 2015 at 4:23 PM, Neil Horman <nhorman at tuxdriver.com> wrote: >>> > On Fri, Nov 20, 2015 at 01:45:37PM -0500, Mohsin Zaidi wrote: >>> >> Some more observations. >>> >> >>> >> When I said yesterday that changing the unbanned CPUs to 19/55 or >>> >> 18/54 worked correctly for all IRQs, I failed to notice that of the >>> >> 256 IRQs for the interfaces, 3 would never have their affinities get >>> >> updated correctly. >>> >> >>> >> For example, with the banning mask set to "ff,ff7fffff,fff7ffff", the >>> >> smp_affinity_list values for the last 10 IRQs are as follows: >>> >> >>> >> 19 >>> >> 55 >>> >> 26 >>> >> 55 >>> >> 24 >>> >> 55 >>> >> 19 >>> >> 19 >>> >> 19 >>> >> 22 >>> >> >>> >> 3 of these are set to whatever was set for them last (my last test was >>> >> to unban all CPUs). I see this pattern repeated every time. >>> >> >>> >> I changed the test to unban 18-19,54-55 at the same time, and this >>> >> problem went away. When I unbanned just 19/55 and reduced the number >>> >> of queues per interface by one, the problem also went away. >>> >> >>> >> It's as if 2 CPUs can't be successfully assigned 256 IRQs. This also >>> >> holds true if the CPUs are not siblings (e.g. 19/54). >>> >> >>> > I wonder if this is a hardware limitation (i.e. if you're hitting the upper >>> > limit of the elligible cpu set in an MSI write or some such). >>> > >>> > If you manually set all irqs to a single cpu, what happens? >>> > >>> > Neil >>> > >>> >> So there are two dimensions to the problem. One is choosing CPUs just >>> >> on NUMA node 0 doesn't work, and the other is that assigning 256 IRQs >>> >> to 2 CPUs on NUMA node 1 doesn't work. >>> >> Regards, >>> >> Mohsin >>> >> >>> >> >>> >> On Fri, Nov 20, 2015 at 9:45 AM, Neil Horman <nhorman at tuxdriver.com> wrote: >>> >> > On Thu, Nov 19, 2015 at 01:32:58PM -0500, Mohsin Zaidi wrote: >>> >> >> Thanks, Neil. I'll have the results for you shortly. >>> >> >> >>> >> >> I wanted to point out that each of the 4 interfaces on the server have >>> >> >> 64 queues, so there are a total of 256 queues. Also, the banning is >>> >> >> attempting to direct interrupts to just two processors (#1 and #37) on >>> >> >> the same NUMA node, which is also not the same as the NUMA node that >>> >> >> "owns" the interface I am looking at (eth03). >>> >> >> >>> >> >> Does any of this matter? >>> >> > It really shouldn't, but given that I'm at a loss to explain the behavior yet, >>> >> > anything is on the table. >>> >> > Neil >>> >> > >>> >> >> Regards, >>> >> >> Mohsin >>> >> >> >>> >> >> >>> >> >> On Thu, Nov 19, 2015 at 9:58 AM, Neil Horman <nhorman at tuxdriver.com> wrote: >>> >> >> > On Wed, Nov 18, 2015 at 10:42:41AM -0500, Mohsin Zaidi wrote: >>> >> >> >> I'm using the irqbalance daemon with the following config file. The >>> >> >> >> only thing I've changed is the banned CPUs list, and I've banned all >>> >> >> >> but CPUs #1 and #37. Interrupts *never* go to #1, and go to #18 and >>> >> >> >> #37, even though #18 has also been banned. >>> >> >> >> >>> >> >> >> # irqbalance is a daemon process that distributes interrupts across >>> >> >> >> # CPUS on SMP systems. The default is to rebalance once every 10 >>> >> >> >> # seconds. This is the environment file that is specified to systemd via the >>> >> >> >> # EnvironmentFile key in the service unit file (or via whatever method the init >>> >> >> >> # system you're using has. >>> >> >> >> # >>> >> >> >> # ONESHOT=yes >>> >> >> >> # after starting, wait for a minute, then look at the interrupt >>> >> >> >> # load and balance it once; after balancing exit and do not change >>> >> >> >> # it again. >>> >> >> >> #IRQBALANCE_ONESHOT= >>> >> >> >> >>> >> >> >> # >>> >> >> >> # IRQBALANCE_BANNED_CPUS >>> >> >> >> # 64 bit bitmask which allows you to indicate which cpu's should >>> >> >> >> # be skipped when reblancing irqs. Cpu numbers which have their >>> >> >> >> # corresponding bits set to one in this mask will not have any >>> >> >> >> # irq's assigned to them on rebalance >>> >> >> >> # >>> >> >> >> #IRQBALANCE_BANNED_CPUS= >>> >> >> >> IRQBALANCE_BANNED_CPUS=000000ff,ffffffdf,fffffffd >>> >> >> >> >>> >> >> >> # >>> >> >> >> # IRQBALANCE_ARGS >>> >> >> >> # append any args here to the irqbalance daemon as documented in the man page >>> >> >> >> # >>> >> >> >> #IRQBALANCE_ARGS= >>> >> >> >> Regards, >>> >> >> >> Mohsin >>> >> >> >> >>> >> >> >> >>> >> >> >> On Wed, Nov 18, 2015 at 10:28 AM, Neil Horman <nhorman at tuxdriver.com> wrote: >>> >> >> >> > On Wed, Nov 18, 2015 at 10:04:56AM -0500, Mohsin Zaidi wrote: >>> >> >> >> >> Sorry about that, Neil. >>> >> >> >> >> >>> >> >> >> >> I haven't specified any hint policy in IRQBALANCE_ARGS (for the daemon). >>> >> >> >> >> Regards, >>> >> >> >> >> Mohsin >>> >> >> >> >> >>> >> >> >> > Ok, well, I'm at a bit of a loss. irqbalance, based on your output from the >>> >> >> >> > debug log, is working properly, presuming you actually listed cpus 18 and 37 as >>> >> >> >> > your only unbanned one, which you indicate is the opposite of what you've >>> >> >> >> > configured. >>> >> >> >> > >>> >> >> >> > Can you please send me the command line you use to start irqbalance? >>> >> >> >> > >>> >> >> >> > Neil >>> >> >> >> > >>> >> >> >> >> >>> >> >> >> >> On Wed, Nov 18, 2015 at 6:36 AM, Neil Horman <nhorman at tuxdriver.com> wrote: >>> >> >> >> >> > On Fri, Nov 13, 2015 at 04:39:08PM -0500, Neil Horman wrote: >>> >> >> >> >> >> On Fri, Nov 13, 2015 at 01:39:20PM -0500, Mohsin Zaidi wrote: >>> >> >> >> >> >> > Thanks for your reply, Neil. >>> >> >> >> >> >> > >>> >> >> >> >> >> > Yes, when I manually set the irq affinity to avoid #18, it works. >>> >> >> >> >> >> > >>> >> >> >> >> >> > I just downloaded and applied the latest irqbalance code, but it's >>> >> >> >> >> >> > showing the same behavior. >>> >> >> >> >> >> > >>> >> >> >> >> >> What hint policy are you using? >>> >> >> >> >> >> >>> >> >> >> >> >> Neil >>> >> >> >> >> >> >>> >> >> >> >> > Ping, any response regarding hint policy? >>> >> >> >> >> > >>> >> >> >> >> > Neil >>> >> >> >> >> > >>> >> >> >> >> >>> >> >> >> >>> >> >> > >>> >> >> > I'm at something of a loss here. I can see no reason why this would fail on >>> >> >> > only one system. In an effort to get additional data, please apply this patch, >>> >> >> > run irqbalance in debug mode and post the output please. >>> >> >> > >>> >> >> > Thanks! >>> >> >> > Neil >>> >> >> > >>> >> >> > >>> >> >> > diff --git a/activate.c b/activate.c >>> >> >> > index c8453d5..d92e770 100644 >>> >> >> > --- a/activate.c >>> >> >> > +++ b/activate.c >>> >> >> > @@ -113,6 +113,7 @@ static void activate_mapping(struct irq_info *info, void *data __attribute__((un >>> >> >> > return; >>> >> >> > >>> >> >> > cpumask_scnprintf(buf, PATH_MAX, applied_mask); >>> >> >> > + printf("Applying mask for irq %d: 5s\n", info->irq, buf); >>> >> >> > fprintf(file, "%s", buf); >>> >> >> > fclose(file); >>> >> >> > info->moved = 0; /*migration is done*/ >>> >> >> >>> >> >>>