From: Arnd Bergmann <arnd@xxxxxxxx> commit 1139aeb1c521eb4a050920ce6c64c36c4f2a3ab7 upstream. As of commit 966a967116e6 ("smp: Avoid using two cache lines for struct call_single_data"), the smp code prefers 32-byte aligned call_single_data objects for performance reasons, but the block layer includes an instance of this structure in the main 'struct request' that is more senstive to size than to performance here, see 4ccafe032005 ("block: unalign call_single_data in struct request"). The result is a violation of the calling conventions that clang correctly points out: block/blk-mq.c:630:39: warning: passing 8-byte aligned argument to 32-byte aligned parameter 2 of 'smp_call_function_single_async' may result in an unaligned pointer access [-Walign-mismatch] smp_call_function_single_async(cpu, &rq->csd); It does seem that the usage of the call_single_data without cache line alignment should still be allowed by the smp code, so just change the function prototype so it accepts both, but leave the default alignment unchanged for the other users. This seems better to me than adding a local hack to shut up an otherwise correct warning in the caller. Signed-off-by: Arnd Bergmann <arnd@xxxxxxxx> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx> Acked-by: Jens Axboe <axboe@xxxxxxxxx> Link: https://lkml.kernel.org/r/20210505211300.3174456-1-arnd@xxxxxxxxxx [nc: Fix conflicts, modify rq_csd_init] Signed-off-by: Nathan Chancellor <nathan@xxxxxxxxxx> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx> --- include/linux/smp.h | 2 +- kernel/sched/core.c | 2 +- kernel/smp.c | 20 ++++++++++---------- kernel/up.c | 2 +- 4 files changed, 13 insertions(+), 13 deletions(-) --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -74,7 +74,7 @@ void on_each_cpu_cond(smp_cond_func_t co void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func, void *info, bool wait, const struct cpumask *mask); -int smp_call_function_single_async(int cpu, call_single_data_t *csd); +int smp_call_function_single_async(int cpu, struct __call_single_data *csd); #ifdef CONFIG_SMP --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -321,7 +321,7 @@ void update_rq_clock(struct rq *rq) } static inline void -rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) +rq_csd_init(struct rq *rq, struct __call_single_data *csd, smp_call_func_t func) { csd->flags = 0; csd->func = func; --- a/kernel/smp.c +++ b/kernel/smp.c @@ -110,7 +110,7 @@ static DEFINE_PER_CPU(void *, cur_csd_in static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ -static void csd_lock_record(call_single_data_t *csd) +static void csd_lock_record(struct __call_single_data *csd) { if (!csd) { smp_mb(); /* NULL cur_csd after unlock. */ @@ -125,7 +125,7 @@ static void csd_lock_record(call_single_ /* Or before unlock, as the case may be. */ } -static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) +static __always_inline int csd_lock_wait_getcpu(struct __call_single_data *csd) { unsigned int csd_type; @@ -140,7 +140,7 @@ static __always_inline int csd_lock_wait * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, * so waiting on other types gets much less information. */ -static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +static __always_inline bool csd_lock_wait_toolong(struct __call_single_data *csd, u64 ts0, u64 *ts1, int *bug_id) { int cpu = -1; int cpux; @@ -204,7 +204,7 @@ static __always_inline bool csd_lock_wai * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static __always_inline void csd_lock_wait(struct __call_single_data *csd) { int bug_id = 0; u64 ts0, ts1; @@ -219,17 +219,17 @@ static __always_inline void csd_lock_wai } #else -static void csd_lock_record(call_single_data_t *csd) +static void csd_lock_record(struct __call_single_data *csd) { } -static __always_inline void csd_lock_wait(call_single_data_t *csd) +static __always_inline void csd_lock_wait(struct __call_single_data *csd) { smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } #endif -static __always_inline void csd_lock(call_single_data_t *csd) +static __always_inline void csd_lock(struct __call_single_data *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -242,7 +242,7 @@ static __always_inline void csd_lock(cal smp_wmb(); } -static __always_inline void csd_unlock(call_single_data_t *csd) +static __always_inline void csd_unlock(struct __call_single_data *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -276,7 +276,7 @@ void __smp_call_single_queue(int cpu, st * for execution on the given CPU. data must already have * ->func, ->info, and ->flags set. */ -static int generic_exec_single(int cpu, call_single_data_t *csd) +static int generic_exec_single(int cpu, struct __call_single_data *csd) { if (cpu == smp_processor_id()) { smp_call_func_t func = csd->func; @@ -542,7 +542,7 @@ EXPORT_SYMBOL(smp_call_function_single); * NOTE: Be careful, there is unfortunately no current debugging facility to * validate the correctness of this serialization. */ -int smp_call_function_single_async(int cpu, call_single_data_t *csd) +int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { int err = 0; --- a/kernel/up.c +++ b/kernel/up.c @@ -25,7 +25,7 @@ int smp_call_function_single(int cpu, vo } EXPORT_SYMBOL(smp_call_function_single); -int smp_call_function_single_async(int cpu, call_single_data_t *csd) +int smp_call_function_single_async(int cpu, struct __call_single_data *csd) { unsigned long flags;