Very nice. I can't explain the performance change, but it's a nice cleanup regardless.

Reviewed by me

On Tue, 6 Sept 2022 at 11:31, Enzo Matsumiya <ematsumiya@xxxxxxx> wrote:
>
> Replace hardcoded alignment computations (e.g. (len + 7) & ~0x7) of
> (mostly) SMB2 message lengths with the ALIGN()/IS_ALIGNED() macros
> (the right tools for the job, less prone to errors).
>
> Also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8); the former,
> if not optimized out by the compiler, incurs a multiplication and a
> division. Do the same for roundup(), replacing it with round_up() (the
> division-less version; it requires the multiple to be a power of 2,
> which is always the case for us).
>
> Also remove some unnecessary checks where !IS_ALIGNED() would fit;
> calling round_up() directly is fine there, as it is a no-op when the
> value is already aligned.
>
> Worried that this might be a useless micro-optimization, I ran some
> tests with a very light workload and already observed a ~50% perf
> improvement:
>
> Record all cifs.ko functions:
> # trace-cmd record --module cifs -p function_graph
>
> (test-dir has ~50MB with ~4000 files)
> Test commands after the share is mounted and with no other activity:
> # cp -r test-dir /mnt/test
> # sync
> # umount /mnt/test
>
> Number of traced functions:
> # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | wc -l
> - unpatched: 307746
> - patched: 313199
>
> Measuring the average latency of all traced functions:
> # trace-cmd report -l | awk '{ print $6 }' | grep "^[0-9].*" | jq -s add/length
> - unpatched: 27105.577791262822 us
> - patched: 14548.665733635338 us
>
> So even though the patched version made 5k+ more function calls (for
> whatever reason), it still did so with ~50% lower latency.
>
> On a larger scale, given the affected code paths, this patch should
> show a relevant improvement.
>
> Signed-off-by: Enzo Matsumiya <ematsumiya@xxxxxxx>
> ---
> Please let me know if my measurements are enough/valid.
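
For anyone who wants to double-check that the conversions are purely
mechanical, here is a throwaway userspace test. The macro definitions in
it are my own simplified restatements of the kernel ones (not the exact
header definitions), just enough to show that the open-coded mask, the
DIV_ROUND_UP() form, round_up() and ALIGN() all land on the same 8-byte
boundary, and that IS_ALIGNED() matches the old "% 8"-style checks:

/*
 * Throwaway userspace sanity check (not part of the patch): simplified
 * restatements of the kernel macros, good enough to show the swapped
 * expressions are equivalent for a power-of-2 alignment.
 */
#include <assert.h>
#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
#define ALIGN(x, a)         (((x) + (a) - 1) & ~((a) - 1))
#define round_up(x, y)      ((((x) - 1) | ((y) - 1)) + 1)
#define IS_ALIGNED(x, a)    (((x) & ((a) - 1)) == 0)

int main(void)
{
	for (unsigned int len = 0; len <= 4096; len++) {
		unsigned int open_coded = (len + 7) & ~0x7;
		unsigned int div_mul    = DIV_ROUND_UP(len, 8) * 8;

		/* All four spellings must give the same aligned length. */
		assert(open_coded == ALIGN(len, 8));
		assert(div_mul == ALIGN(len, 8));
		assert(round_up(len, 8) == ALIGN(len, 8));

		/* And the new alignment test matches the old "% 8" check. */
		assert(IS_ALIGNED(len, 8) == (len % 8 == 0));
	}
	printf("ALIGN()/round_up()/IS_ALIGNED() agree with the old forms\n");
	return 0;
}

If it prints the final line, the spellings are interchangeable for the
cases touched here.
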
> > fs/cifs/cifssmb.c | 7 +++--- > fs/cifs/connect.c | 11 ++++++-- > fs/cifs/sess.c | 18 +++++-------- > fs/cifs/smb2inode.c | 4 +-- > fs/cifs/smb2misc.c | 2 +- > fs/cifs/smb2pdu.c | 61 +++++++++++++++++++-------------------------- > 6 files changed, 47 insertions(+), 56 deletions(-) > > diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c > index 7aa91e272027..addf3fc62aef 100644 > --- a/fs/cifs/cifssmb.c > +++ b/fs/cifs/cifssmb.c > @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon, > remap); > } > rename_info->target_name_len = cpu_to_le32(2 * len_of_str); > - count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); > + count = sizeof(struct set_file_rename) + (2 * len_of_str); > byte_count += count; > pSMB->DataCount = cpu_to_le16(count); > pSMB->TotalDataCount = pSMB->DataCount; > @@ -2796,7 +2796,7 @@ CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, > cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n"); > goto qreparse_out; > } > - end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; > + end_of_smb = sizeof(__le16) + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount; > reparse_buf = (struct reparse_symlink_data *) > ((char *)&pSMBr->hdr.Protocol + data_offset); > if ((char *)reparse_buf >= end_of_smb) { > @@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata, > pSMBr = (struct smb_com_ntransact_rsp *)buf; > > bcc = get_bcc(&pSMBr->hdr); > - end_of_smb = 2 /* sizeof byte count */ + bcc + > - (char *)&pSMBr->ByteCount; > + end_of_smb = sizeof(__le16) + bcc + (char *)&pSMBr->ByteCount; > > data_offset = le32_to_cpu(pSMBr->DataOffset); > data_count = le32_to_cpu(pSMBr->DataCount); > diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c > index a0a06b6f252b..389127e21563 100644 > --- a/fs/cifs/connect.c > +++ b/fs/cifs/connect.c > @@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) > * sessinit is sent but no second negprot > */ > struct rfc1002_session_packet *ses_init_buf; > + unsigned int req_noscope_len; > struct smb_hdr *smb_buf; > + > ses_init_buf = kzalloc(sizeof(struct rfc1002_session_packet), > GFP_KERNEL); > + > if (ses_init_buf) { > ses_init_buf->trailer.session_req.called_len = 32; > > @@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) > ses_init_buf->trailer.session_req.scope2 = 0; > smb_buf = (struct smb_hdr *)ses_init_buf; > > - /* sizeof RFC1002_SESSION_REQUEST with no scope */ > - smb_buf->smb_buf_length = cpu_to_be32(0x81000044); > + /* sizeof RFC1002_SESSION_REQUEST with no scopes */ > + req_noscope_len = sizeof(struct rfc1002_session_packet) - 2; > + > + /* == cpu_to_be32(0x81000044) */ > + smb_buf->smb_buf_length = > + cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | req_noscope_len); > rc = smb_send(server, smb_buf, 0x44); > kfree(ses_init_buf); > /* > diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c > index 3af3b05b6c74..951874928d70 100644 > --- a/fs/cifs/sess.c > +++ b/fs/cifs/sess.c > @@ -601,11 +601,6 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, > /* BB FIXME add check that strings total less > than 335 or will need to send them as arrays */ > > - /* unicode strings, must be word aligned before the call */ > -/* if ((long) bcc_ptr % 2) { > - *bcc_ptr = 0; > - bcc_ptr++; > - } */ > /* copy user */ > if (ses->user_name == NULL) { > /* null user mount */ > @@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) > } > > if (ses->capabilities & CAP_UNICODE) { 
> - if (sess_data->iov[0].iov_len % 2) { > + if (!IS_ALIGNED(sess_data->iov[0].iov_len, 2)) { > *bcc_ptr = 0; > bcc_ptr++; > } > @@ -1358,7 +1353,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data) > /* no string area to decode, do nothing */ > } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { > /* unicode string area must be word-aligned */ > - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { > + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { > ++bcc_ptr; > --bytes_remaining; > } > @@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data) > > if (ses->capabilities & CAP_UNICODE) { > /* unicode strings must be word aligned */ > - if ((sess_data->iov[0].iov_len > - + sess_data->iov[1].iov_len) % 2) { > + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { > *bcc_ptr = 0; > bcc_ptr++; > } > @@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data) > /* no string area to decode, do nothing */ > } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { > /* unicode string area must be word-aligned */ > - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { > + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { > ++bcc_ptr; > --bytes_remaining; > } > @@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) > > bcc_ptr = sess_data->iov[2].iov_base; > /* unicode strings must be word aligned */ > - if ((sess_data->iov[0].iov_len + sess_data->iov[1].iov_len) % 2) { > + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { > *bcc_ptr = 0; > bcc_ptr++; > } > @@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data) > /* no string area to decode, do nothing */ > } else if (smb_buf->Flags2 & SMBFLG2_UNICODE) { > /* unicode string area must be word-aligned */ > - if (((unsigned long) bcc_ptr - (unsigned long) smb_buf) % 2) { > + if (!IS_ALIGNED((unsigned long)bcc_ptr - (unsigned long)smb_buf, 2)) { > ++bcc_ptr; > --bytes_remaining; > } > diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c > index b83f59051b26..4eefbe574b82 100644 > --- a/fs/cifs/smb2inode.c > +++ b/fs/cifs/smb2inode.c > @@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, > rqst[num_rqst].rq_iov = &vars->si_iov[0]; > rqst[num_rqst].rq_nvec = 1; > > - size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */ > + size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */ > data[0] = &delete_pending[0]; > > rc = SMB2_set_info_init(tcon, server, > @@ -225,7 +225,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon, > rqst[num_rqst].rq_iov = &vars->si_iov[0]; > rqst[num_rqst].rq_nvec = 1; > > - size[0] = 8; /* sizeof __le64 */ > + size[0] = sizeof(__le64); > data[0] = ptr; > > rc = SMB2_set_info_init(tcon, server, > diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c > index d73e5672aac4..258b01306d85 100644 > --- a/fs/cifs/smb2misc.c > +++ b/fs/cifs/smb2misc.c > @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server) > * Some windows servers (win2016) will pad also the final > * PDU in a compound to 8 bytes. 
> */ > - if (((calc_len + 7) & ~7) == len) > + if (ALIGN(calc_len, 8) == len) > return 0; > > /* > diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c > index 6352ab32c7e7..5da0b596c8a0 100644 > --- a/fs/cifs/smb2pdu.c > +++ b/fs/cifs/smb2pdu.c > @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt) > /* > * Context Data length must be rounded to multiple of 8 for some servers > */ > - pneg_ctxt->DataLength = cpu_to_le16(DIV_ROUND_UP( > - sizeof(struct smb2_signing_capabilities) - > - sizeof(struct smb2_neg_context) + > - (num_algs * 2 /* sizeof u16 */), 8) * 8); > + pneg_ctxt->DataLength = cpu_to_le16(ALIGN(sizeof(struct smb2_signing_capabilities) - > + sizeof(struct smb2_neg_context) + > + (num_algs * sizeof(u16)), 8)); > pneg_ctxt->SigningAlgorithmCount = cpu_to_le16(num_algs); > pneg_ctxt->SigningAlgorithms[0] = cpu_to_le16(SIGNING_ALG_AES_CMAC); > > - ctxt_len += 2 /* sizeof le16 */ * num_algs; > - ctxt_len = DIV_ROUND_UP(ctxt_len, 8) * 8; > + ctxt_len += sizeof(__le16) * num_algs; > + ctxt_len = ALIGN(ctxt_len, 8); > return ctxt_len; > /* TBD add SIGNING_ALG_AES_GMAC and/or SIGNING_ALG_HMAC_SHA256 */ > } > @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname) > /* copy up to max of first 100 bytes of server name to NetName field */ > pneg_ctxt->DataLength = cpu_to_le16(2 * cifs_strtoUTF16(pneg_ctxt->NetName, hostname, 100, cp)); > /* context size is DataLength + minimal smb2_neg_context */ > - return DIV_ROUND_UP(le16_to_cpu(pneg_ctxt->DataLength) + > - sizeof(struct smb2_neg_context), 8) * 8; > + return ALIGN(le16_to_cpu(pneg_ctxt->DataLength) + sizeof(struct smb2_neg_context), 8); > } > > static void > @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, > * round up total_len of fixed part of SMB3 negotiate request to 8 > * byte boundary before adding negotiate contexts > */ > - *total_len = roundup(*total_len, 8); > + *total_len = ALIGN(*total_len, 8); > > pneg_ctxt = (*total_len) + (char *)req; > req->NegotiateContextOffset = cpu_to_le32(*total_len); > > build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); > - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_preauth_neg_context), 8) * 8; > + ctxt_len = ALIGN(sizeof(struct smb2_preauth_neg_context), 8); > *total_len += ctxt_len; > pneg_ctxt += ctxt_len; > > build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); > - ctxt_len = DIV_ROUND_UP(sizeof(struct smb2_encryption_neg_context), 8) * 8; > + ctxt_len = ALIGN(sizeof(struct smb2_encryption_neg_context), 8); > *total_len += ctxt_len; > pneg_ctxt += ctxt_len; > > @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req, > if (server->compress_algorithm) { > build_compression_ctxt((struct smb2_compression_capabilities_context *) > pneg_ctxt); > - ctxt_len = DIV_ROUND_UP( > - sizeof(struct smb2_compression_capabilities_context), > - 8) * 8; > + ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8); > *total_len += ctxt_len; > pneg_ctxt += ctxt_len; > neg_context_count++; > @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp, > if (rc) > break; > /* offsets must be 8 byte aligned */ > - clen = (clen + 7) & ~0x7; > + clen = ALIGN(clen, 8); > offset += clen + sizeof(struct smb2_neg_context); > len_of_ctxts -= clen; > } > @@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) > unsigned int group_offset = 0; > struct smb3_acl acl; > > - *len = roundup(sizeof(struct 
crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); > + *len = round_up(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8); > > if (set_owner) { > /* sizeof(struct owner_group_sids) is already multiple of 8 so no need to round */ > @@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) > memcpy(aclptr, &acl, sizeof(struct smb3_acl)); > > buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); > - *len = roundup(ptr - (__u8 *)buf, 8); > + *len = round_up((unsigned int)(ptr - (__u8 *)buf), 8); > > return buf; > } > @@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len, > * final path needs to be 8-byte aligned as specified in > * MS-SMB2 2.2.13 SMB2 CREATE Request. > */ > - *out_size = roundup(*out_len * sizeof(__le16), 8); > + *out_size = round_up(*out_len * sizeof(__le16), 8); > *out_path = kzalloc(*out_size + sizeof(__le16) /* null */, GFP_KERNEL); > if (!*out_path) > return -ENOMEM; > @@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, > uni_path_len = (2 * UniStrnlen((wchar_t *)utf16_path, PATH_MAX)) + 2; > /* MUST set path len (NameLength) to 0 opening root of share */ > req->NameLength = cpu_to_le16(uni_path_len - 2); > - if (uni_path_len % 8 != 0) { > - copy_size = roundup(uni_path_len, 8); > - copy_path = kzalloc(copy_size, GFP_KERNEL); > - if (!copy_path) { > - rc = -ENOMEM; > - goto err_free_req; > - } > - memcpy((char *)copy_path, (const char *)utf16_path, > - uni_path_len); > - uni_path_len = copy_size; > - /* free before overwriting resource */ > - kfree(utf16_path); > - utf16_path = copy_path; > + copy_size = round_up(uni_path_len, 8); > + copy_path = kzalloc(copy_size, GFP_KERNEL); > + if (!copy_path) { > + rc = -ENOMEM; > + goto err_free_req; > } > + memcpy((char *)copy_path, (const char *)utf16_path, uni_path_len); > + uni_path_len = copy_size; > + /* free before overwriting resource */ > + kfree(utf16_path); > + utf16_path = copy_path; > } > > iov[1].iov_len = uni_path_len; > @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, > uni_path_len = (2 * UniStrnlen((wchar_t *)path, PATH_MAX)) + 2; > /* MUST set path len (NameLength) to 0 opening root of share */ > req->NameLength = cpu_to_le16(uni_path_len - 2); > - copy_size = uni_path_len; > - if (copy_size % 8 != 0) > - copy_size = roundup(copy_size, 8); > + copy_size = round_up(uni_path_len, 8); > copy_path = kzalloc(copy_size, GFP_KERNEL); > if (!copy_path) > return -ENOMEM; > @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, > if (request_type & CHAINED_REQUEST) { > if (!(request_type & END_OF_CHAIN)) { > /* next 8-byte aligned request */ > - *total_len = DIV_ROUND_UP(*total_len, 8) * 8; > + *total_len = ALIGN(*total_len, 8); > shdr->NextCommand = cpu_to_le32(*total_len); > } else /* END_OF_CHAIN */ > shdr->NextCommand = 0; > -- > 2.35.3 >
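
One closing note on the roundup() -> round_up() conversions, since the
commit message calls out the power-of-2 requirement: round_up() is
mask-based, so it silently gives a short result for non-power-of-2
multiples. Every conversion in this patch uses 2 or 8, so it's fine as
posted; the toy comparison below (again with simplified stand-ins for
the kernel macros, purely illustrative) just shows why the restriction
matters if anyone extends this later:

/*
 * Illustrative userspace example (not from the patch): simplified
 * stand-ins for the kernel's roundup()/round_up() showing why
 * round_up() is only valid for power-of-2 multiples.
 */
#include <stdio.h>

#define roundup(x, y)   ((((x) + ((y) - 1)) / (y)) * (y))  /* any multiple */
#define round_up(x, y)  ((((x) - 1) | ((y) - 1)) + 1)      /* power of 2 only */

int main(void)
{
	unsigned int len = 13;

	/* Power-of-2 multiple: both forms agree (16). */
	printf("align 8:  roundup=%u round_up=%u\n",
	       roundup(len, 8), round_up(len, 8));

	/* Non-power-of-2 multiple: round_up() is wrong (16 instead of 24). */
	printf("align 12: roundup=%u round_up=%u\n",
	       roundup(len, 12), round_up(len, 12));
	return 0;
}

Not a blocker, just something to keep in mind for future changes.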