On 7/10/23 8:18 PM, Stanislav Fomichev wrote:
On 07/10, Daniel Borkmann wrote:
On 7/7/23 11:27 PM, Stanislav Fomichev wrote:
[...]
+static int bpf_mprog_prog(struct bpf_tuple *tuple,
+			  u32 object, u32 flags,
+			  enum bpf_prog_type type)
+{
+	bool id = flags & BPF_F_ID;
+	struct bpf_prog *prog;
+
+	if (id)
+		prog = bpf_prog_by_id(object);
+	else
+		prog = bpf_prog_get(object);
+	if (IS_ERR(prog)) {
[..]
+		if (!object && !id)
+			return 0;
What's the reason behind this?
If an fd of 0 was passed and it was not a program fd, then we don't error
out and instead treat it as if no fd was passed.
Is this new API an opportunity to fix that fd==0 case and always treat it as
valid? Or do we have some other constraints elsewhere?
Not that I'm aware of; it should work fine in the new API.
+		return PTR_ERR(prog);
+	}
+	if (type && prog->type != type) {
+		bpf_prog_put(prog);
+		return -EINVAL;
+	}
+
+	tuple->link = NULL;
+	tuple->prog = prog;
+	return 0;
+}
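
For illustration, a minimal sketch of the fd==0 semantics discussed above,
exercising the helper exactly as quoted; the calls and expected results are
not part of the patch and only spell out what the snippet already does:

  struct bpf_tuple tuple = {};
  int err;

  /* object == 0 without BPF_F_ID: bpf_prog_get(0) typically fails since
   * fd 0 is not a BPF program, and the !object && !id check turns this
   * into "no program given": err == 0 and tuple.prog stays NULL.
   */
  err = bpf_mprog_prog(&tuple, 0, 0, BPF_PROG_TYPE_SCHED_CLS);

  /* object == 0 with BPF_F_ID: program ID 0 never exists, so the error
   * from bpf_prog_by_id() is propagated instead: err < 0.
   */
  err = bpf_mprog_prog(&tuple, 0, BPF_F_ID, BPF_PROG_TYPE_SCHED_CLS);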
[...]
+static int bpf_mprog_pos_before(struct bpf_mprog_entry *entry,
+				struct bpf_tuple *tuple)
+{
+	struct bpf_mprog_fp *fp;
+	struct bpf_mprog_cp *cp;
+	int i;
+
+	for (i = 0; i < bpf_mprog_total(entry); i++) {
+		bpf_mprog_read(entry, i, &fp, &cp);
+		if (tuple->prog == READ_ONCE(fp->prog) &&
Both attach/detach happen under rtnl, so why do we need READ_ONCE? I'm
assuming that even going forward, attach/detach from non-tcx places will
happen under a lock?
(Same for bpf_mprog_pos_before/bpf_mprog_pos_after.)
It feels like the only place where we need WRITE_ONCE is the in-place
replace, plus READ_ONCE during the fast path. Why do we need the rest?
Yes, the replace case is via WRITE_ONCE, hence the READ_ONCE annotations. You
are saying that for the cases where we are under the lock we should just drop
the READ_ONCE annotations? I can do that, of course; I thought the general
convention was to use {READ,WRITE}_ONCE consistently for the purpose of
documenting fp->prog access.
I see, then maybe let's keep them. I was a bit confused because those
READ_ONCE are within a locked section so I wasn't sure whether I'm
missing something or it's working as intended :-)
Okay. I added the explanation around locking in the big comment I sent in the
other thread to Alexei.
+		    (!tuple->link || tuple->link == cp->link))
+			return i - 1;
+	}
+	return tuple->prog ? -ENOENT : -1;
+}
+
+static int bpf_mprog_pos_after(struct bpf_mprog_entry *entry,
+			       struct bpf_tuple *tuple)
+{
+	struct bpf_mprog_fp *fp;
+	struct bpf_mprog_cp *cp;
+	int i;
+
+	for (i = 0; i < bpf_mprog_total(entry); i++) {
+		bpf_mprog_read(entry, i, &fp, &cp);
+		if (tuple->prog == READ_ONCE(fp->prog) &&
+		    (!tuple->link || tuple->link == cp->link))
+			return i + 1;
+	}
+	return tuple->prog ? -ENOENT : bpf_mprog_total(entry);
+}
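
To make the annotation convention from the READ_ONCE discussion above
concrete, a minimal sketch (not taken from the patch; the fp_slot/slot_*
names are made up for illustration): writers are serialized by a lock such
as RTNL, but the fast path dereferences the program pointer locklessly, so
the in-place replace publishes via WRITE_ONCE() and every reader pairs it
with READ_ONCE(), even readers that happen to hold the lock:

  struct fp_slot {
  	struct bpf_prog *prog;
  };

  /* In-place replace; callers are serialized by the attach/detach lock. */
  static void slot_replace(struct fp_slot *fp, struct bpf_prog *nprog)
  {
  	WRITE_ONCE(fp->prog, nprog);
  }

  /* May run locklessly on the fast path; also used under the lock so that
   * every access to the shared pointer is visibly annotated.
   */
  static struct bpf_prog *slot_fetch(struct fp_slot *fp)
  {
  	return READ_ONCE(fp->prog);
  }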
+
+int bpf_mprog_attach(struct bpf_mprog_entry *entry, struct bpf_prog *prog_new,
+		     struct bpf_link *link, struct bpf_prog *prog_old,
+		     u32 flags, u32 object, u64 revision)
+{
+	struct bpf_tuple rtuple, ntuple = {
+		.prog = prog_new,
+		.link = link,
+	}, otuple = {
+		.prog = prog_old,
+		.link = link,
+	};
+	int ret, idx = -2, tidx;
+
+	if (revision && revision != bpf_mprog_revision(entry))
+		return -ESTALE;
+	if (bpf_mprog_exists(entry, prog_new))
+		return -EEXIST;
+	ret = bpf_mprog_tuple_relative(&rtuple, object,
+				       flags & ~BPF_F_REPLACE,
+				       prog_new->type);
+	if (ret)
+		return ret;
+	if (flags & BPF_F_REPLACE) {
+		tidx = bpf_mprog_pos_exact(entry, &otuple);
+		if (tidx < 0) {
+			ret = tidx;
+			goto out;
+		}
+		idx = tidx;
+	}
[..]
+	if (flags & BPF_F_BEFORE) {
+		tidx = bpf_mprog_pos_before(entry, &rtuple);
+		if (tidx < -1 || (idx >= -1 && tidx != idx)) {
+			ret = tidx < -1 ? tidx : -EDOM;
+			goto out;
+		}
+		idx = tidx;
+	}
+	if (flags & BPF_F_AFTER) {
+		tidx = bpf_mprog_pos_after(entry, &rtuple);
+		if (tidx < 0 || (idx >= -1 && tidx != idx)) {
+			ret = tidx < 0 ? tidx : -EDOM;
+			goto out;
+		}
+		idx = tidx;
+	}
There still seems to be some interdependency between F_BEFORE and F_AFTER?
IOW, it looks like I can pass F_BEFORE|F_AFTER|F_REPLACE. Do we need that?
Why not make the cases exclusive?
I reworked this as per Andrii's suggestion/preference from v2 [0], IOW, to
calculate the target index and bail out if the request cannot be resolved
into a common index.
[0] https://lore.kernel.org/bpf/CAEf4BzbsUMnP7WMm3OmJznvD2b03B1qASFRNiDoVAU6XvvTZNA@xxxxxxxxxxxxxx/
SG! Let's maybe put a summary in the header of what the expectation is when
combining them?
Yes, sounds good, will add a comment.
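
As a hedged illustration of the semantics being summarized: each positional
flag (BPF_F_REPLACE, BPF_F_BEFORE, BPF_F_AFTER) is resolved to a target
index, and a combination is only accepted when all of them resolve to the
same index, otherwise the attach fails with -EDOM (and with -ESTALE on a
revision mismatch). A userspace sketch, assuming a libbpf version exposing
the tcx attach options; prog_fd, rel_fd, ifindex and rev are placeholders:

  /* Attach prog_fd directly before rel_fd on tcx ingress of ifindex,
   * but only if the attach list is still at revision rev.
   */
  LIBBPF_OPTS(bpf_prog_attach_opts, opts,
  	.flags		   = BPF_F_BEFORE,
  	.relative_fd	   = rel_fd,
  	.expected_revision = rev,
  );
  int err = bpf_prog_attach_opts(prog_fd, ifindex, BPF_TCX_INGRESS, &opts);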
Thanks,
Daniel