On Wed, May 11, 2022 at 05:27:46AM +0300, Kirill A. Shutemov wrote: > +#define __untagged_addr(addr, n) \ > + ((__force __typeof__(addr))sign_extend64((__force u64)(addr), n)) > + > +#define untagged_addr(addr) ({ \ > + u64 __addr = (__force u64)(addr); \ > + if (__addr >> 63 == 0) { \ > + if (current->thread.features & X86_THREAD_LAM_U57) \ > + __addr &= __untagged_addr(__addr, 56); \ > + else if (current->thread.features & X86_THREAD_LAM_U48) \ > + __addr &= __untagged_addr(__addr, 47); \ > + } \ > + (__force __typeof__(addr))__addr; \ > +}) Assuming you got your bits in hardware order: u64 __addr = addr; if ((s64)__addr >= 0) { int lam = (current->thread.features >> X86_THREAD_LAM_U57) & 3; if (lam) __addr &= sign_extend64(__addr, 65 - 9*lam); } __addr; has fewer branches and should definitely result in better code (or I need more morning juice). > + > +#define untagged_ptr(ptr) ({ \ > + u64 __ptrval = (__force u64)(ptr); \ > + __ptrval = untagged_addr(__ptrval); \ > + (__force __typeof__(*(ptr)) *)__ptrval; \ > +}) > #endif /* !__ASSEMBLY__ */ > > #ifdef CONFIG_X86_VSYSCALL_EMULATION > -- > 2.35.1 >