Hi Jacob Pan, On Tue, Apr 23, 2024 at 10:41:05AM -0700, Jacob Pan wrote: > Mixture of bitfields and types is weird and really not intuitive, remove > bitfields and use typed data exclusively. Bitfields often result in > inferior machine code. > > Link: https://lore.kernel.org/all/20240404101735.402feec8@jacob-builder/T/#mf66e34a82a48f4d8e2926b5581eff59a122de53a > Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx> > Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx> > Signed-off-by: Jacob Pan <jacob.jun.pan@xxxxxxxxxxxxxxx> > > --- > v3: > - Fix a bug where SN bit position was used as the mask, reported by > Oliver Sang. We tested this version and confirmed that the issue is gone. Tested-by: kernel test robot <oliver.sang@xxxxxxxxx> The details below are just for reference. For the previous version, we noticed a "WARNING:at_arch/x86/kvm/vmx/posted_intr.c:#pi_enable_wakeup_handler[kvm_intel]" in testcase: kernel-selftests The call trace is shown below: [ 399.225452][ T8098] ------------[ cut here ]------------ [ 399.232475][ T8098] PI descriptor SN field set before blocking [ 399.232514][ T8098] WARNING: CPU: 184 PID: 8098 at arch/x86/kvm/vmx/posted_intr.c:160 pi_enable_wakeup_handler+0x421/0x5f0 [kvm_intel] [ 399.254685][ T8098] Modules linked in: openvswitch nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 intel_rapl_msr intel_rapl_common btrfs +blake2b_generic x86_pkg_temp_thermal xor intel_powerclamp zstd_compress coretemp raid6_pq libcrc32c kvm_intel kvm nvme crct10dif_pclmul crc32_pclmul +nvme_core crc32c_intel t10_pi ghash_clmulni_intel ast sha512_ssse3 rapl intel_cstate dax_hmem crc64_rocksoft_generic drm_shmem_helper mei_me i2c_i801 +crc64_rocksoft mei drm_kms_helper i2c_smbus crc64 i2c_ismt wmi ipmi_ssif acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler acpi_pad acpi_power_meter joydev +binfmt_misc loop fuse drm dm_mod ip_tables [ 399.321529][ T8098] CPU: 184 PID: 8098 Comm: xapic_ipi_test Not tainted 6.9.0-rc1-00008-g037ccaed5dc5 #1 [ 399.333325][ T8098] RIP: 
0010:pi_enable_wakeup_handler+0x421/0x5f0 [kvm_intel] [ 399.342631][ T8098] Code: e8 a4 5a 7b c0 e9 b2 fc ff ff e8 5a 5c 7b c0 fb eb 93 bf f1 00 00 00 e8 dd 9c 29 c0 eb 82 48 c7 c7 e0 c6 ee c0 e8 0f 2f 37 c0 +<0f> 0b e9 e7 fe ff ff 4c 89 f7 e8 00 6a c8 c0 e9 cd fe ff ff 4c 89 [ 399.365742][ T8098] RSP: 0018:ffa000003a527780 EFLAGS: 00010086 [ 399.373626][ T8098] RAX: 0000000000000000 RBX: ff1100247c23c740 RCX: 0000000000000027 [ 399.383668][ T8098] RDX: 0000000000000027 RSI: 0000000000000004 RDI: ff11003fc3030c08 [ 399.393759][ T8098] RBP: ffa000003a527908 R08: 0000000000000001 R09: ffe21c07f8606181 [ 399.403834][ T8098] R10: ff11003fc3030c0b R11: 0000000000000000 R12: 1ff40000074a4ef4 [ 399.413901][ T8098] R13: 0000000000000000 R14: ff1100247c23e4a0 R15: 00000000000000b8 [ 399.423969][ T8098] FS: 00007f10de2006c0(0000) GS:ff11003fc3000000(0000) knlGS:0000000000000000 [ 399.435126][ T8098] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 399.443651][ T8098] CR2: 0000000000000000 CR3: 000000210b958004 CR4: 0000000000f73ef0 [ 399.453753][ T8098] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 399.463835][ T8098] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 [ 399.473912][ T8098] PKRU: 55555554 [ 399.479005][ T8098] Call Trace: [ 399.483812][ T8098] <TASK> [ 399.488224][ T8098] ? __warn+0xcc/0x2d0 [ 399.493903][ T8098] ? pi_enable_wakeup_handler+0x421/0x5f0 [kvm_intel] [ 399.502643][ T8098] ? report_bug+0x261/0x2c0 [ 399.508825][ T8098] ? handle_bug+0x3a/0x90 [ 399.514817][ T8098] ? exc_invalid_op+0x17/0x40 [ 399.521191][ T8098] ? asm_exc_invalid_op+0x1a/0x20 [ 399.527938][ T8098] ? pi_enable_wakeup_handler+0x421/0x5f0 [kvm_intel] [ 399.536677][ T8098] ? __pfx_pi_enable_wakeup_handler+0x10/0x10 [kvm_intel] [ 399.545747][ T8098] ? __pfx_lock_repin_lock+0x10/0x10 [ 399.552740][ T8098] ? newidle_balance+0xc85/0x1300 [ 399.559408][ T8098] ? __pfx_perf_event_context_sched_out+0x10/0x10 [ 399.567652][ T8098] ? 
lock_acquire+0x432/0x4e0 [ 399.573959][ T8098] ? vmx_get_rflags+0x26/0x2c0 [kvm_intel] [ 399.581527][ T8098] vmx_vcpu_pi_put+0x1d3/0x230 [kvm_intel] [ 399.589119][ T8098] vmx_vcpu_put+0x12/0x20 [kvm_intel] [ 399.596205][ T8098] kvm_arch_vcpu_put+0x49e/0x7b0 [kvm] [ 399.603463][ T8098] kvm_sched_out+0xb2/0xe0 [kvm] [ 399.610100][ T8098] prepare_task_switch+0x321/0xc40 [ 399.616863][ T8098] ? lock_release+0x1bf/0x240 [ 399.623146][ T8098] __schedule+0x5a6/0x20b0 [ 399.629125][ T8098] ? __pfx___schedule+0x10/0x10 [ 399.635581][ T8098] ? __pfx_lock_acquire+0x10/0x10 [ 399.642228][ T8098] ? kvm_apic_has_interrupt+0x9c/0x160 [kvm] [ 399.650034][ T8098] ? lock_acquire+0x432/0x4e0 [ 399.656267][ T8098] ? __pfx_lock_acquire+0x10/0x10 [ 399.662905][ T8098] schedule+0xe2/0x2a0 [ 399.668478][ T8098] kvm_vcpu_block+0xd1/0x1c0 [kvm] [ 399.675308][ T8098] kvm_vcpu_halt+0xee/0x900 [kvm] [ 399.682018][ T8098] vcpu_run+0x50a/0x9d0 [kvm] [ 399.688346][ T8098] kvm_arch_vcpu_ioctl_run+0x377/0x1430 [kvm] [ 399.696254][ T8098] ? lock_release+0xe5/0x240 [ 399.702428][ T8098] kvm_vcpu_ioctl+0x34c/0xc40 [kvm] [ 399.709341][ T8098] ? __pfx_kvm_vcpu_ioctl+0x10/0x10 [kvm] [ 399.716821][ T8098] ? __lock_release+0x103/0x440 [ 399.723960][ T8098] ? __fget_files+0x1c7/0x330 [ 399.730219][ T8098] ? __pfx___lock_release+0x10/0x10 [ 399.737750][ T8098] ? __fget_files+0x1cc/0x330 [ 399.744028][ T8098] ? __fget_files+0x1c7/0x330 [ 399.750289][ T8098] ? lock_release+0xe5/0x240 [ 399.756461][ T8098] ? __fget_files+0x1cc/0x330 [ 399.762719][ T8098] __x64_sys_ioctl+0x134/0x1b0 [ 399.769077][ T8098] do_syscall_64+0x93/0x170 [ 399.775146][ T8098] ? do_user_addr_fault+0x477/0xcb0 [ 399.781989][ T8098] ? 
lockdep_hardirqs_on_prepare+0x279/0x3e0 [ 399.789654][ T8098] entry_SYSCALL_64_after_hwframe+0x6c/0x74 [ 399.797192][ T8098] RIP: 0033:0x7f10de88bc5b [ 399.803079][ T8098] Code: 00 48 89 44 24 18 31 c0 48 8d 44 24 60 c7 04 24 10 00 00 00 48 89 44 24 08 48 8d 44 24 20 48 89 44 24 10 b8 10 00 00 00 0f 05 +<89> c2 3d 00 f0 ff ff 77 1c 48 8b 44 24 18 64 48 2b 04 25 28 00 00 [ 399.826105][ T8098] RSP: 002b:00007f10de1ff9c0 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 399.836565][ T8098] RAX: ffffffffffffffda RBX: 000000001a5418c0 RCX: 00007f10de88bc5b [ 399.846521][ T8098] R > - Add and use non-atomic helpers to manipulate SN bit > - Use pi_test_sn() instead of open coding > v2: > - Replace bitfields, no more mix. > --- > arch/x86/include/asm/posted_intr.h | 21 ++++++++++++--------- > arch/x86/kvm/vmx/posted_intr.c | 4 ++-- > arch/x86/kvm/vmx/vmx.c | 2 +- > 3 files changed, 15 insertions(+), 12 deletions(-) > > diff --git a/arch/x86/include/asm/posted_intr.h b/arch/x86/include/asm/posted_intr.h > index acf237b2882e..20e31891de15 100644 > --- a/arch/x86/include/asm/posted_intr.h > +++ b/arch/x86/include/asm/posted_intr.h > @@ -15,17 +15,9 @@ struct pi_desc { > }; > union { > struct { > - /* bit 256 - Outstanding Notification */ > - u16 on : 1, > - /* bit 257 - Suppress Notification */ > - sn : 1, > - /* bit 271:258 - Reserved */ > - rsvd_1 : 14; > - /* bit 279:272 - Notification Vector */ > + u16 notifications; /* Suppress and outstanding bits */ > u8 nv; > - /* bit 287:280 - Reserved */ > u8 rsvd_2; > - /* bit 319:288 - Notification Destination */ > u32 ndst; > }; > u64 control; > @@ -88,4 +80,15 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc) > return test_bit(POSTED_INTR_SN, (unsigned long *)&pi_desc->control); > } > > +/* Non-atomic helpers */ > +static inline void __pi_set_sn(struct pi_desc *pi_desc) > +{ > + pi_desc->notifications |= BIT(POSTED_INTR_SN); > +} > + > +static inline void __pi_clear_sn(struct pi_desc *pi_desc) > +{ > + pi_desc->notifications 
&= ~BIT(POSTED_INTR_SN); > +} > + > #endif /* _X86_POSTED_INTR_H */ > diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c > index af662312fd07..ec08fa3caf43 100644 > --- a/arch/x86/kvm/vmx/posted_intr.c > +++ b/arch/x86/kvm/vmx/posted_intr.c > @@ -107,7 +107,7 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) > * handle task migration (@cpu != vcpu->cpu). > */ > new.ndst = dest; > - new.sn = 0; > + __pi_clear_sn(&new); > > /* > * Restore the notification vector; in the blocking case, the > @@ -157,7 +157,7 @@ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) > &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); > raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); > > - WARN(pi_desc->sn, "PI descriptor SN field set before blocking"); > + WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking"); > > old.control = READ_ONCE(pi_desc->control); > do { > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c > index d94bb069bac9..f505745913c8 100644 > --- a/arch/x86/kvm/vmx/vmx.c > +++ b/arch/x86/kvm/vmx/vmx.c > @@ -4843,7 +4843,7 @@ static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu) > * or POSTED_INTR_WAKEUP_VECTOR. > */ > vmx->pi_desc.nv = POSTED_INTR_VECTOR; > - vmx->pi_desc.sn = 1; > + __pi_set_sn(&vmx->pi_desc); > } > > static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > -- > 2.25.1 >