From: Joe Burton <jevburton@xxxxxxxxxx> Add an example that demonstrates how map tracing helps us avoid race conditions while upgrading a stateful program. Signed-off-by: Joe Burton <jevburton@xxxxxxxxxx> --- .../selftests/bpf/prog_tests/bpf_map_trace.c | 257 ++++++++++++++++++ .../progs/bpf_map_trace_real_world_common.h | 125 +++++++++ .../bpf_map_trace_real_world_migration.c | 96 +++++++ .../bpf/progs/bpf_map_trace_real_world_new.c | 4 + .../bpf/progs/bpf_map_trace_real_world_old.c | 5 + 5 files changed, 487 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_common.h create mode 100644 tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_migration.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_new.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_old.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_map_trace.c b/tools/testing/selftests/bpf/prog_tests/bpf_map_trace.c index 89bae9a83339..cd60a0c78202 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_map_trace.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_map_trace.c @@ -19,6 +19,9 @@ #include "bpf_map_trace_delete_elem.skel.h" #include "bpf_map_trace_loop0.skel.h" #include "bpf_map_trace_loop1.skel.h" +#include "bpf_map_trace_real_world_migration.skel.h" +#include "bpf_map_trace_real_world_new.skel.h" +#include "bpf_map_trace_real_world_old.skel.h" #include "bpf_map_trace_update_elem.skel.h" uint32_t collatz(uint32_t x) @@ -130,8 +133,262 @@ void infinite_loop__indirect(void) bpf_map_trace_loop1__destroy(skel); } +int real_world_example__attach_migration( + struct bpf_map_trace_real_world_migration *migration_skel, + struct bpf_link **iter_link, + struct bpf_link **map_trace_link_update, + struct bpf_link **map_trace_link_delete) +{ + union bpf_iter_link_info iter_link_info; + struct bpf_iter_attach_opts iter_opts; + int64_t error; + + *map_trace_link_update = bpf_program__attach( +
migration_skel->progs.copy_on_write__update); + error = libbpf_get_error(map_trace_link_update); + if (!ASSERT_EQ(error, 0, + "copy_on_write update bpf_program__attach failure")) + return 1; + + *map_trace_link_delete = bpf_program__attach( + migration_skel->progs.copy_on_write__delete); + error = libbpf_get_error(map_trace_link_delete); + if (!ASSERT_EQ(error, 0, + "copy_on_write update bpf_program__delete failure")) + return 1; + + memset(&iter_link_info, 0, sizeof(iter_link_info)); + iter_link_info.map.map_fd = bpf_map__fd(migration_skel->maps.old_map); + + memset(&iter_opts, 0, sizeof(iter_opts)); + iter_opts.sz = sizeof(iter_opts); + iter_opts.link_info = &iter_link_info; + iter_opts.link_info_len = sizeof(iter_link_info); + *iter_link = bpf_program__attach_iter( + migration_skel->progs.bulk_migration, &iter_opts); + error = libbpf_get_error(iter_link); + if (!ASSERT_EQ(error, 0, "bpf_program__attach_iter failure")) + return 1; + + return 0; +} + +int open_and_write_files(const char *path, size_t num_files) +{ + int *fds = malloc(sizeof(int) * num_files); + ssize_t bytes_written; + const char buf = 'a'; + size_t i, j; + int ret = 0; + + if (fds == NULL) + return 1; + + for (i = 0; i < num_files; i++) { + fds[i] = open(path, O_WRONLY | O_CREAT); + + if (fds[i] < 0) { + ret = 2; + break; + } + bytes_written = write(fds[i], &buf, sizeof(buf)); + if (bytes_written != sizeof(buf)) { + ret = 3; + break; + } + } + for (j = 0; j < i; j++) + close(fds[j]); + return ret; +} + +void real_world_example(void) +{ + struct bpf_map_trace_real_world_migration *migration_skel = NULL; + int file_fd_should_write = -1, file_fd_should_not_write = -1; + struct bpf_map_trace_real_world_new *new_skel = NULL; + struct bpf_map_trace_real_world_old *old_skel = NULL; + struct bpf_link *map_trace_link_update = NULL; + struct bpf_link *map_trace_link_delete = NULL; + struct bpf_link *iter_link = NULL; + const bool enable_filtering = 1; + const uint32_t pid = getpid(); + uint32_t 
max_open_files; + char file_buf = 'a'; + int iter_fd = -1; + char iter_buf[1]; + int rc; + + /* + * Begin by loading and attaching the old version of our program. + */ + old_skel = bpf_map_trace_real_world_old__open_and_load(); + if (!ASSERT_NEQ(old_skel, NULL, "open/load old skeleton")) + return; + rc = bpf_map_trace_real_world_old__attach(old_skel); + if (!ASSERT_EQ(rc, 0, "attach old skeleton")) { + fprintf(stderr, "Failed to attach skeleton: %d\n", errno); + goto out; + } + rc = bpf_map_update_elem(bpf_map__fd(old_skel->maps.filtered_pids), + &pid, &enable_filtering, /*flags=*/0); + if (!ASSERT_EQ(rc, 0, "configure process to be filtered")) + return; + if (!ASSERT_EQ(open_and_write_files("/tmp/tst_file", 1), 0, + "program allows writing a single new file")) + goto out; + max_open_files = bpf_map__max_entries(old_skel->maps.allow_reads); + if (!ASSERT_NEQ(open_and_write_files("/tmp/tst_file", + max_open_files + 1), 0, + "program blocks writing too many new files")) + goto out; + + /* + * Then load the new version of the program. + */ + new_skel = bpf_map_trace_real_world_new__open_and_load(); + if (!ASSERT_NEQ(new_skel, NULL, "open/load new skeleton")) + goto out; + + /* + * Hook up the migration programs. This gives the old map + * copy-on-write semantics. 
+ */ + migration_skel = bpf_map_trace_real_world_migration__open(); + if (!ASSERT_NEQ(migration_skel, NULL, "open migration skeleton")) + goto out; + rc = bpf_map__reuse_fd(migration_skel->maps.old_map, + bpf_map__fd(old_skel->maps.allow_reads)); + if (!ASSERT_EQ(rc, 0, "reuse old map fd")) + goto out; + rc = bpf_map__reuse_fd(migration_skel->maps.new_map, + bpf_map__fd(new_skel->maps.allow_reads)); + if (!ASSERT_EQ(rc, 0, "reuse new map fd")) + goto out; + rc = bpf_map_trace_real_world_migration__load(migration_skel); + if (!ASSERT_EQ(rc, 0, "load migration skeleton")) + goto out; + rc = real_world_example__attach_migration(migration_skel, + &iter_link, + &map_trace_link_update, + &map_trace_link_delete); + if (!ASSERT_EQ(rc, 0, "attach migration programs")) + goto out; + + /* + * Simulated race condition type 1: An application opens an fd before + * bulk transfer and closes it after. + */ + file_fd_should_not_write = open("/tmp/tst_file", O_WRONLY | O_CREAT); + if (!ASSERT_GE(file_fd_should_not_write, 0, + "open file before bulk migration")) + goto out; + + /* + * Perform bulk transfer. + */ + iter_fd = bpf_iter_create(bpf_link__fd(iter_link)); + if (!ASSERT_GE(iter_fd, 0, "create iterator")) + goto out; + rc = read(iter_fd, &iter_buf, sizeof(iter_buf)); + if (!ASSERT_EQ(rc, 0, "execute map iterator")) + goto out; + rc = bpf_map_update_elem(bpf_map__fd(new_skel->maps.filtered_pids), + &pid, &enable_filtering, /*flags=*/0); + if (!ASSERT_EQ(rc, 0, "configure process to be filtered")) + goto out; + + /* + * Simulated race condition type 1 (continued). This close() does not + * propagate to the new map without copy-on-write semantics, so it + * would occupy a spot in the map until our app happens to close an fd + * with the same number. This would subtly degrade the contract with + * the application. 
+ */ + close(file_fd_should_not_write); + file_fd_should_not_write = -1; + + /* + * Simulated race condition type 2: An application opens a file + * descriptor after bulk transfer. This openat() does not propagate to + * the new map without copy-on-write, so our app would not be able to + * write to it. + */ + file_fd_should_write = open("/tmp/tst_file", O_WRONLY | O_CREAT); + if (!ASSERT_GE(file_fd_should_write, 0, + "open file after bulk migration")) + goto out; + + /* + * State is migrated. Load new programs. + */ + rc = bpf_map_trace_real_world_new__attach(new_skel); + if (!ASSERT_EQ(rc, 0, "failed to attach new programs")) + goto out; + + /* + * Unload migration progs. + */ + close(iter_fd); + iter_fd = -1; + bpf_link__destroy(map_trace_link_update); + map_trace_link_update = NULL; + bpf_link__destroy(map_trace_link_delete); + map_trace_link_delete = NULL; + bpf_link__destroy(iter_link); + iter_link = NULL; + bpf_map_trace_real_world_migration__destroy(migration_skel); + migration_skel = NULL; + + /* + * Unload old programs. + */ + bpf_map_trace_real_world_old__destroy(old_skel); + old_skel = NULL; + + if (!ASSERT_EQ(open_and_write_files("/tmp/tst_file", 1), 0, + "program allows writing a single new file")) + goto out; + max_open_files = bpf_map__max_entries(new_skel->maps.allow_reads); + if (!ASSERT_NEQ(open_and_write_files("/tmp/tst_file", + max_open_files + 1), 0, + "program blocks writing too many new files")) + goto out; + /* + * Simulated race condition type 2 (continued): If we didn't do + * copy-on-write, this would be expected to fail, since the FD would + * not be in the new map. 
+ */ + rc = write(file_fd_should_write, &file_buf, sizeof(file_buf)); + if (!ASSERT_EQ(rc, sizeof(file_buf), + "migrated program allows writing to file opened before migration")) + goto out; + +out: + if (old_skel) + bpf_map_trace_real_world_old__destroy(old_skel); + if (new_skel) + bpf_map_trace_real_world_new__destroy(new_skel); + if (migration_skel) + bpf_map_trace_real_world_migration__destroy(migration_skel); + if (map_trace_link_update) + bpf_link__destroy(map_trace_link_update); + if (map_trace_link_delete) + bpf_link__destroy(map_trace_link_delete); + if (iter_link) + bpf_link__destroy(iter_link); + if (iter_fd > -1) + close(iter_fd); + if (file_fd_should_write > -1) + close(file_fd_should_write); + if (file_fd_should_not_write > -1) + close(file_fd_should_not_write); +} + void test_bpf_map_trace(void) { + if (test__start_subtest("real_world_example")) + real_world_example(); if (test__start_subtest("update_elem__basic")) update_elem__basic(); if (test__start_subtest("delete_elem__basic")) diff --git a/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_common.h b/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_common.h new file mode 100644 index 000000000000..230610e1b5d5 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_common.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (c) 2021 Google */ +#pragma once + +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> +#include <bpf/bpf_tracing.h> +#include <errno.h> +#include <string.h> + +/* + * Mock "real world" application. + * + * Blocks all writes from a set of applications. A limited number of newly + * openat()ed file descriptors may be written to. Writes to + * already-open file descriptors are blocked. + * + * The affected processes are selected by populating filtered_pids. + * + * It is intended as an example of a stateful policy-enforcement application + * which benefits from map tracing.
It is not intended to be useful. + */ + +/* + * This is the only difference between the old and new application. Since we're + * enforcing a policy based on this data, we want to migrate it. Since the + * application can modify the data in parallel, we need to give this map + * copy-on-write semantics so that those changes propagate. + */ +#if defined(OLD_VERSION) +struct allow_reads_key { + uint32_t pid; + int fd; +}; +#else +struct allow_reads_key { + int fd; + uint32_t pid; +}; +#endif +struct allow_reads_value { + bool do_allow; +}; +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 16); + __type(key, struct allow_reads_key); + __type(value, struct allow_reads_value); +} allow_reads SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 16); + __type(key, uint32_t); + __type(value, bool); +} filtered_pids SEC(".maps"); + + +SEC("kretprobe/__x64_sys_openat") +int BPF_KRETPROBE(kretprobe__x64_sys_openat, int ret) +{ + struct allow_reads_key key; + struct allow_reads_value val; + uint32_t pid; + char *pid_is_filtered; + + pid = (bpf_get_current_pid_tgid() >> 32) & 0xFFFFFFFF; + memset(&key, 0, sizeof(key)); + key.pid = pid; + key.fd = ret; + val.do_allow = true; + + if (ret < 0) + return 0; + + pid_is_filtered = bpf_map_lookup_elem(&filtered_pids, &pid); + if (!pid_is_filtered) + return 0; + + if (!*pid_is_filtered) + return 0; + + /* + * Ignore errors. Failing to insert has the effect of blocking writes + * on that file descriptor. + */ + bpf_map_update_elem(&allow_reads, &key, &val, /*flags=*/0); + return 0; +} + +SEC("fmod_ret/__x64_sys_write") +int BPF_PROG(fmod_ret__x64_sys_write, struct pt_regs *regs, int ret) +{ + int fd = PT_REGS_PARM1(regs); + struct allow_reads_value *val; + struct allow_reads_key key; + + memset(&key, 0, sizeof(key)); + key.pid = (bpf_get_current_pid_tgid() >> 32) & 0xFFFFFFFF; + key.fd = fd; + val = bpf_map_lookup_elem(&allow_reads, &key); + if (!val) + return -EPERM; + return val->do_allow ? 
0 : -EPERM; +} + +SEC("fmod_ret/__x64_sys_close") +int BPF_PROG(fmod_ret__x64_sys_close, struct pt_regs *regs, int ret) +{ + int fd = PT_REGS_PARM1(regs); + struct allow_reads_key key; + struct allow_reads_value val; + + memset(&key, 0, sizeof(key)); + key.pid = (bpf_get_current_pid_tgid() >> 32) & 0xFFFFFFFF; + key.fd = fd; + val.do_allow = true; + + bpf_map_delete_elem(&allow_reads, &key); + return 0; +} + +char _license[] SEC("license") = "GPL"; + diff --git a/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_migration.c b/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_migration.c new file mode 100644 index 000000000000..d0a37ae1be26 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_migration.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google */ +#include "vmlinux.h" + +#include <bpf/bpf_helpers.h> + +/* In the "real" real world, we would use BTF to generate a program which knows + * about the old and new map ABI. To keep things simple we'll just use a + * statically defined program which knows about them. + */ +struct allow_reads_key__old { + uint32_t pid; + int fd; +}; +struct allow_reads_key__new { + int fd; + uint32_t pid; +}; +struct allow_reads_value__old { + bool do_drop; +}; +struct allow_reads_value__new { + bool do_drop; +}; + +/* Likewise, in the "real" real world we would simply generate a program + * containing the fd of this map. For libbpf to generate a skeleton for us we + * need to duplicate this definition.
+ */ +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 100); + __type(key, struct allow_reads_key__old); + __type(value, struct allow_reads_value__old); +} old_map SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 100); + __type(key, struct allow_reads_key__new); + __type(value, struct allow_reads_value__new); +} new_map SEC(".maps"); + +static inline void read_migrate_write(void *key, void *value) +{ + struct allow_reads_key__old old_key = {}; + struct allow_reads_key__new new_key = {}; + char old_value = 0; + + if (bpf_probe_read(&old_key, sizeof(old_key), key)) + return; /* Could write to a map here */ + if (bpf_probe_read(&old_value, sizeof(old_value), value)) + return; /* Could write to a map here */ + + new_key.pid = old_key.pid; + new_key.fd = old_key.fd; + + bpf_map_update_elem(&new_map, &new_key, &old_value, /*flags=*/0); +} + +SEC("map_trace/old_map/UPDATE_ELEM") +int copy_on_write__update(struct bpf_map_trace_ctx__update_elem *ctx) +{ + read_migrate_write(ctx->key, ctx->value); + return 0; +} + +static inline void read_migrate_delete(void *key) +{ + struct allow_reads_key__old old_key = {}; + struct allow_reads_key__new new_key = {}; + + if (bpf_probe_read(&old_key, sizeof(old_key), key)) + return; /* Could write to a map here */ + + new_key.pid = old_key.pid; + new_key.fd = old_key.fd; + + bpf_map_delete_elem(&new_map, &new_key); +} + +SEC("map_trace/old_map/DELETE_ELEM") +int copy_on_write__delete(struct bpf_map_trace_ctx__delete_elem *ctx) +{ + read_migrate_delete(ctx->key); + return 0; +} + +SEC("iter/bpf_map_elem") +int bulk_migration(struct bpf_iter__bpf_map_elem *ctx) +{ + read_migrate_write(ctx->key, ctx->value); + return 0; +} + +char _license[] SEC("license") = "GPL"; + diff --git a/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_new.c b/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_new.c new file mode 100644 index 000000000000..9b7c4ca1deed --- /dev/null +++ 
b/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_new.c @@ -0,0 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google */ +#include "bpf_map_trace_real_world_common.h" + diff --git a/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_old.c b/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_old.c new file mode 100644 index 000000000000..9f0bdd7baf71 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_map_trace_real_world_old.c @@ -0,0 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2021 Google */ +#define OLD_VERSION +#include "bpf_map_trace_real_world_common.h" + -- 2.33.0.685.g46640cef36-goog