On Sun, March 18, 2012 23:52, Eric Dumazet wrote: > Le dimanche 18 mars 2012 à 19:35 +1100, Indan Zupancic a écrit : > >> Yes. The main difference would be that the JIT could always generate imm8 >> offsets, saving 4 bytes per long offset while also simplifying the compiler >> code. The %rdi + 127 is 4 bytes, and if the rest become slightly faster >> because they're imm8 then it's worth the extra instruction. >> > > Do you understand you try to save 3 bytes in the function prolog, but > your single EMIT4(0x48, 0x83, 0xc7, 127); /* addq $127,%rdi */ > defeats this ? That's not what I'm trying to do, I'm trying to simplify the code so it's easier to verify and less cluttered. I admit the fixup in bpf_slow_path_common makes it a bad idea over all. But if that one extra addq would have been all that was needed then it would have been worth it I think. Even if it makes the prologue one byte longer. > > Ancillary instructions are rarely used, libpcap for example doesnt have > support for them. > >> I first thought the +127 could be done in two bytes, but 4 bytes are >> needed, so maybe it's not worth it. > ... >> The add 127 would be at the start, the first instruction using it would >> be a couple of instructions later, so I don't think the dependency is a >> problem. >> >> You're right about skb_copy_bits(), I did a quick search for rdi usage >> but missed it was the first parameter too. It would need one extra >> sub 127 or add -127 in the slow path, after the push. But it's the slow >> path already, one extra instruction won't make much difference. > > It will, because new NIC drivers tend to provide skbs with fragments. If it does then perhaps the fast path should be made faster by inlining the code instead of calling a function which may not be cached. > > Using libpcap filter like "udp[100]" calls the skb_copy_bits() helper in > this case. > > There is no difference in instruction timing using offset32 or offset8, > so the code you add will slow the filter anyway. I assumed optimising for imm8 was worthwile, but if it's the same speed, saving a few bytes here and there while wasting kilobytes of memory any way doesn't make much sense. Greetings, Indan [PATCH] net: bpf_jit: Simplify code by always using offset8 or offset32. Instruction timing of offset32 or offset8 is the same, do not bother saving a few bytes of code here and there. Only use offset8 for skb.len, skb.data_len and skb.dev: It is very unlikely that those fields are ever moved to the end of sk_buff. The other fields are used in ancillary instructions, for those always use offset32. Signed-off-by: Indan Zupancic <indan@xxxxxx> arch/x86/net/bpf_jit_comp.c | 104 +++++++++++++------------------------------ 1 files changed, 31 insertions(+), 73 deletions(-) --- diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 7c1b765..5ddb82b 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -102,6 +102,12 @@ do { \ f_op = FOP; \ goto cond_branch +#define SKB_OFF8(field) ({ \ + int _off = offsetof(struct sk_buff, field); \ + BUILD_BUG_ON(_off > 127); \ + _off; \ + }) + #define SEEN_DATAREF 1 /* might call external helpers */ #define SEEN_XREG 2 /* ebx is used */ @@ -172,30 +178,13 @@ void bpf_jit_compile(struct sk_filter *fp) * r8 = skb->data */ if (seen_or_pass0 & SEEN_DATAREF) { - if (offsetof(struct sk_buff, len) <= 127) - /* mov off8(%rdi),%r9d */ - EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); - else { - /* mov off32(%rdi),%r9d */ - EMIT3(0x44, 0x8b, 0x8f); - EMIT(offsetof(struct sk_buff, len), 4); - } - if (is_imm8(offsetof(struct sk_buff, data_len))) - /* sub off8(%rdi),%r9d */ - EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len)); - else { - EMIT3(0x44, 0x2b, 0x8f); - EMIT(offsetof(struct sk_buff, data_len), 4); - } - - if (is_imm8(offsetof(struct sk_buff, data))) - /* mov off8(%rdi),%r8 */ - EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data)); - else { - /* mov off32(%rdi),%r8 */ - EMIT3(0x4c, 0x8b, 0x87); - EMIT(offsetof(struct sk_buff, data), 4); - } + /* mov off8(%rdi),%r9d */ + EMIT4(0x44, 0x8b, 0x4f, SKB_OFF8(len)); + /* sub off8(%rdi),%r9d */ + EMIT4(0x44, 0x2b, 0x4f, SKB_OFF8(data_len)); + /* mov off32(%rdi),%r8 */ + EMIT3(0x4c, 0x8b, 0x87); + EMIT(offsetof(struct sk_buff, data), 4); } } @@ -391,43 +380,24 @@ void bpf_jit_compile(struct sk_filter *fp) break; case BPF_S_LD_W_LEN: /* A = skb->len; */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len)); - else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, len), 4); - } + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, SKB_OFF8(len)); break; case BPF_S_LDX_W_LEN: /* X = skb->len; */ seen |= SEEN_XREG; - if (is_imm8(offsetof(struct sk_buff, len))) - /* mov off8(%rdi),%ebx */ - EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len)); - else { - EMIT2(0x8b, 0x9f); - EMIT(offsetof(struct sk_buff, len), 4); - } + /* mov off8(%rdi),%ebx */ + EMIT3(0x8b, 0x5f, SKB_OFF8(len)); break; case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); - if (is_imm8(offsetof(struct sk_buff, protocol))) { - /* movzwl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol)); - } else { - EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ - EMIT(offsetof(struct sk_buff, protocol), 4); - } + /* movzwl off32(%rdi),%eax */ + EMIT3(0x0f, 0xb7, 0x87); + EMIT(offsetof(struct sk_buff, protocol), 4); EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */ break; case BPF_S_ANC_IFINDEX: - if (is_imm8(offsetof(struct sk_buff, dev))) { - /* movq off8(%rdi),%rax */ - EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev)); - } else { - EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */ - EMIT(offsetof(struct sk_buff, dev), 4); - } + /* movq off8(%rdi),%rax */ + EMIT4(0x48, 0x8b, 0x47, SKB_OFF8(dev)); EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */ EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6)); BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); @@ -436,33 +406,21 @@ void bpf_jit_compile(struct sk_filter *fp) break; case BPF_S_ANC_MARK: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - if (is_imm8(offsetof(struct sk_buff, mark))) { - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark)); - } else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, mark), 4); - } + /* mov off32(%rdi),%eax */ + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, mark), 4); break; case BPF_S_ANC_RXHASH: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4); - if (is_imm8(offsetof(struct sk_buff, rxhash))) { - /* mov off8(%rdi),%eax */ - EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash)); - } else { - EMIT2(0x8b, 0x87); - EMIT(offsetof(struct sk_buff, rxhash), 4); - } + /* mov off32(%rdi),%eax */ + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, rxhash), 4); break; case BPF_S_ANC_QUEUE: BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - if (is_imm8(offsetof(struct sk_buff, queue_mapping))) { - /* movzwl off8(%rdi),%eax */ - EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping)); - } else { - EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ - EMIT(offsetof(struct sk_buff, queue_mapping), 4); - } + /* movzwl off32(%rdi),%eax */ + EMIT3(0x0f, 0xb7, 0x87); + EMIT(offsetof(struct sk_buff, queue_mapping), 4); break; case BPF_S_ANC_CPU: #ifdef CONFIG_SMP -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html