On Tue, Sep 6, 2022 at 3:34 AM Enzo Matsumiya <ematsumiya@xxxxxxx> wrote:
> But also replace (DIV_ROUND_UP(len, 8) * 8) with ALIGN(len, 8), which,
> if not optimized by the compiler, has the overhead of a multiplication
> and a division. Do the same for roundup() by replacing it by round_up()
> (division-less version, but requires the multiple to be a power of 2,
> which is always the case for us).
Many of these compile to the same division-less instructions,
especially if any of the values are known at compile time.
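
For reference, this is roughly what the helpers in question expand to
(a simplified sketch paraphrasing include/linux/math.h and
include/linux/align.h, with the type-handling details dropped); it is
what the Compiler Explorer comparison further down exercises:

/* Division-based forms: */
#define DIV_ROUND_UP(n, d)   (((n) + (d) - 1) / (d))
#define roundup(x, y)        ((((x) + ((y) - 1)) / (y)) * (y))

/* Mask-based forms; the multiple must be a power of 2: */
#define ALIGN(x, a)          (((x) + ((a) - 1)) & ~((a) - 1))
#define round_up(x, y)       ((((x) - 1) | ((y) - 1)) + 1)
#define IS_ALIGNED(x, a)     (((x) & ((a) - 1)) == 0)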
> @@ -2305,7 +2305,7 @@ int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *pTcon,
smb1 and computed at compile time
> @@ -3350,8 +3350,7 @@ validate_ntransact(char *buf, char **ppparm, char **ppdata,
smb1
> @@ -2833,9 +2833,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
> @@ -2871,8 +2874,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
connect time
> @@ -1318,7 +1313,7 @@ sess_auth_ntlmv2(struct sess_data *sess_data)
> @@ -1442,8 +1437,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
> @@ -1494,7 +1488,7 @@ sess_auth_kerberos(struct sess_data *sess_data)
> @@ -1546,7 +1540,7 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data)
> @@ -1747,7 +1741,7 @@ sess_auth_rawntlmssp_authenticate(struct sess_data *sess_data)
connect time
> @@ -207,7 +207,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
> - size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
> + size[0] = sizeof(u8); /* See MS-FSCC section 2.4.11 */
> - size[0] = 8; /* sizeof __le64 */
> + size[0] = sizeof(__le64);
Hot path but no functional change
> @@ -248,7 +248,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *server)
>  * Some windows servers (win2016) will pad also the final
>  * PDU in a compound to 8 bytes.
>  */
> - if (((calc_len + 7) & ~7) == len)
> + if (ALIGN(calc_len, 8) == len)
Hot path but should compile to the same thing
> @@ -466,15 +466,14 @@ build_signing_ctxt(struct smb2_signing_capabilities *pneg_ctxt)
> @@ -511,8 +510,7 @@ build_netname_ctxt(struct smb2_netname_neg_context *pneg_ctxt, char *hostname)
> @@ -557,18 +555,18 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
> @@ -595,9 +593,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
> @@ -780,7 +776,7 @@ static int smb311_decode_neg_context(struct smb2_negotiate_rsp *rsp,
connect time
> @@ -2413,7 +2409,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
> @@ -2487,7 +2483,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len)
only relevant if mounted with some acl flags
> @@ -2581,7 +2577,7 @@ alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
> - *out_size = roundup(*out_len * sizeof(__le16), 8);
> + *out_size = round_up(*out_len * sizeof(__le16), 8);
Hot path, but from my experiments round_up() compiles to an *extra*
instruction. See below.
> @@ -2687,20 +2683,17 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
Only relevant on posix mounts.
> @@ -2826,9 +2819,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
> - copy_size = uni_path_len;
> - if (copy_size % 8 != 0)
> - copy_size = roundup(copy_size, 8);
> + copy_size = round_up(uni_path_len, 8);
> @@ -4090,7 +4081,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len,
> - *total_len = DIV_ROUND_UP(*total_len, 8) * 8;
> + *total_len = ALIGN(*total_len, 8);
These two are also hot paths, but I'm skeptical about the optimizations.
I've looked at those macros in Compiler Explorer and, sure enough, they
compile to the same thing on x86_64.
Even worse, for some of them the optimized versions compile to extra
instructions:
https://godbolt.org/z/z1xhhW9sj
size_t mod2(size_t num) {
        return num % 2;
}

size_t is_aligned2(size_t num) {
        return IS_ALIGNED(num, 2);
}

mod2:
        mov     rax, rdi
        and     eax, 1
        ret
is_aligned2:
        mov     rax, rdi
        not     rax     <=== extra
        and     eax, 1
        ret

(The extra "not" is because IS_ALIGNED() tests (num & 1) == 0, so it
returns the inverted low bit rather than num % 2 itself.)
--------------------------
size_t align8(size_t num) {
        return ALIGN(num, 8);
}

size_t align_andshift8(size_t num) {
        return ((num + 7) & ~7);
}

align8:
        lea     rax, [rdi+7]
        and     rax, -8
        ret
align_andshift8:
        lea     rax, [rdi+7]
        and     rax, -8
        ret
same code
--------------------------
size_t dru8(size_t num) {
        return DIV_ROUND_UP(num, 8) * 8;
}

size_t rnup8_a(size_t num) {
        return round_up(num, 8);
}

size_t rnup8_b(size_t num) {
        return roundup(num, 8);
}

dru8:
        lea     rax, [rdi+7]
        and     rax, -8
        ret
rnup8_a:
        lea     rax, [rdi-1]
        or      rax, 7     <==== extra
        add     rax, 1
        ret
rnup8_b:
        lea     rax, [rdi+7]
        and     rax, -8
        ret
round_up() has an extra instruction: it rounds up as ((num - 1) | 7) + 1
rather than ((num + 7) & ~7), and the compiler emits that form as written.
--------------------------
I suspect the improvements are more likely related to caches, system
load, server load, etc.
You can try using perf to make a flame graph of each version and compare.
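
If it helps, something along these lines produces a flame graph (assuming
perf and Brendan Gregg's FlameGraph scripts from
https://github.com/brendangregg/FlameGraph are at hand; the workload name
is just a placeholder):

# sample stacks at 99 Hz, with call graphs, while the cifs workload runs
perf record -F 99 -g -- ./your-cifs-workload
# fold the stacks and render an SVG
perf script | ./FlameGraph/stackcollapse-perf.pl | ./FlameGraph/flamegraph.pl > cifs.svg

Do that once with and once without the patch and diff the two graphs.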
Cheers,