The uptr KV store is a dynamically resizable key-value store that aims to make rolling out bpf programs with map value layout changes easier by hiding the layout from the bpf program. It is built on top of existing bpf features, with both a user space and a bpf API. To support usage in bpf programs on hot paths, only simple APIs such as get/put/delete are provided in bpf; space-management APIs are available only in user space. To use the uptr KV store, the user space program first needs to call kv_store_init() to allocate memory and set up uptrs in the task_local_storage of a given process. It returns a pointer to "struct kv_store" on success, which is then used as a token to access the KV store in the other APIs. Second, it needs to initialize all key-value pairs with kv_store_put(). Then, both the bpf and the user space program can start their normal operation. In the bpf program, the API is designed to minimize map lookups. Therefore, the bpf program needs to first look up the task_local_storage. All bpf APIs then take the map value as the first argument, and these APIs do not incur additional map lookups. A simple way of using the KV store to allow easy bpf program rollout is to use multiple key-value pairs where the values are primitive datatypes instead of a structure. This way, adding/deleting a field is just adding/deleting a key, without moving data. The following is an example of how this would work.

user space: kv_store_init()
user space: kv_store_put({key1, key2, key3})
prog_v1:    kv_store_get/put({key1, key2, key3})
user space: kv_store_delete{key1}
user space: kv_store_add{key4}
user space: kv_store_set_map_reuse(prog_v2.data_map)
prog_v2:    kv_store_get/put({key2, key3, key4})

At the core of the KV store are metadata and data. To access a value stored in the data, the metadata is first queried using an integer key. The metadata is an array of entries containing the offset and size of each value. Both metadata and data are stored in uptr regions in the task_local_storage.
Note that, it is also possible to support string keys by replacing the backing storage of metadata with an hashmap. However, the additional map lookup per API may suggest higher performance overhead. Signed-off-by: Amery Hung <ameryhung@xxxxxxxxx> --- .../selftests/bpf/prog_tests/uptr_kv_store.c | 282 ++++++++++++++++++ .../selftests/bpf/prog_tests/uptr_kv_store.h | 22 ++ .../selftests/bpf/progs/uptr_kv_store.h | 120 ++++++++ .../selftests/bpf/uptr_kv_store_common.h | 47 +++ 4 files changed, 471 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/uptr_kv_store.c create mode 100644 tools/testing/selftests/bpf/prog_tests/uptr_kv_store.h create mode 100644 tools/testing/selftests/bpf/progs/uptr_kv_store.h create mode 100644 tools/testing/selftests/bpf/uptr_kv_store_common.h diff --git a/tools/testing/selftests/bpf/prog_tests/uptr_kv_store.c b/tools/testing/selftests/bpf/prog_tests/uptr_kv_store.c new file mode 100644 index 000000000000..18328b1d5a9a --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/uptr_kv_store.c @@ -0,0 +1,282 @@ +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/mman.h> +#include <linux/err.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "task_local_storage_helpers.h" +#include "uptr_kv_store.h" + +struct kv_store { + int data_map_fd; + int task_fd; + int page_cnt; + char *data_map_pin_path; + struct kv_store_data_map_value data; +}; + +static struct kv_store_page *__kv_store_add_page(struct kv_store *kvs) +{ + struct kv_store_page *p; + + if (kvs->page_cnt > KVS_MAX_PAGE_ENTRIES) + return ERR_PTR(-ENOSPC); + + p = mmap(NULL, sizeof(struct kv_store_page), PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (p == MAP_FAILED) + return ERR_PTR(-ENOMEM); + + kvs->data.pages[kvs->page_cnt].page = p; + kvs->page_cnt++; + + return p; +} + +static void __kv_store_del_page(struct kv_store *kvs) +{ + struct kv_store_page *p; + + p = kvs->data.pages[kvs->page_cnt - 
1].page; + kvs->data.pages[kvs->page_cnt - 1].page = NULL; + kvs->page_cnt--; + munmap(p, sizeof(*p)); +} + +static struct kv_store_meta *kvs_store_get_meta(struct kv_store *kvs, int key) +{ + return key < KVS_MAX_VAL_ENTRIES ? &kvs->data.metas->meta[key] : NULL; +} + +void kv_store_close(struct kv_store *kvs) +{ + int i; + + munmap(kvs->data.metas, sizeof(struct kv_store_metas)); + + for (i = 0; i < kvs->page_cnt; i++) + __kv_store_del_page(kvs); + + if (kvs->data_map_pin_path) + unlink(kvs->data_map_pin_path); + + free(kvs); +} + +struct kv_store *kv_store_init(int pid, struct bpf_map *data_map, const char *pin_path) +{ + struct kv_store_page *p; + struct kv_store *kvs; + int err; + + kvs = calloc(1, sizeof(*kvs)); + if (!kvs) { + errno = -ENOMEM; + return NULL; + } + + kvs->data.metas = mmap(NULL, sizeof(struct kv_store_page), + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (kvs->data.metas == MAP_FAILED) { + errno = -ENOMEM; + return NULL; + } + + p = __kv_store_add_page(kvs); + if (IS_ERR(p)) { + errno = PTR_ERR(p); + goto err; + } + + kvs->data_map_fd = bpf_map__fd(data_map); + if (!kvs->data_map_fd) { + errno = -ENOENT; + goto err; + } + + kvs->task_fd = sys_pidfd_open(pid, 0); + if (!kvs->task_fd) { + errno = -ESRCH; + goto err; + } + + err = bpf_map_update_elem(kvs->data_map_fd, &kvs->task_fd, &kvs->data, 0); + if (err) { + errno = err; + goto err; + } + + kvs->data_map_pin_path = strdup(pin_path); + if (!kvs->data_map_pin_path) + goto err; + + err = bpf_map__pin(data_map, kvs->data_map_pin_path); + if (err) { + errno = err; + goto err; + } + + return kvs; +err: + kv_store_close(kvs); + return NULL; +} + +int kv_store_data_map_set_reuse(struct kv_store *kvs, struct bpf_map *data_map) +{ + return bpf_map__reuse_fd(data_map, kvs->data_map_fd); +} + +void *kv_store_get(struct kv_store *kvs, int key) +{ + struct kv_store_meta *meta; + struct kv_store_page *p; + + meta = kvs_store_get_meta(kvs, key); + if (!meta || !meta->init) + return 
NULL; + + p = kvs->data.pages[meta->page_idx].page; + + return p->data + meta->page_off; +} + +static int linear_off(const struct kv_store_meta *meta) +{ + if (!meta->init) + return KVS_MAX_PAGE_ENTRIES * KVS_MAX_VAL_SIZE; + + return meta->page_idx * KVS_MAX_VAL_SIZE + meta->page_off; +} + +static int comp_meta(const void *m1, const void *m2) +{ + struct kv_store_meta *meta1 = (struct kv_store_meta *)m1; + struct kv_store_meta *meta2 = (struct kv_store_meta *)m2; + int off1, off2; + + off1 = linear_off(meta1); + off2 = linear_off(meta2); + + if (off1 > off2) + return 1; + else if (off1 < off2) + return -1; + else + return 0; +} + +static int kv_store_find_next_slot(struct kv_store *kvs, int size, struct kv_store_meta *meta) +{ + struct kv_store_meta metas[KVS_MAX_VAL_ENTRIES]; + int i, err, off, next_off = 0; + struct kv_store_page *p; + + memcpy(metas, kvs->data.metas, sizeof(struct kv_store_meta) * KVS_MAX_VAL_ENTRIES); + + qsort(metas, KVS_MAX_VAL_ENTRIES, sizeof(struct kv_store_meta), comp_meta); + + for (i = 0; i < KVS_MAX_VAL_ENTRIES; i++) { + off = linear_off(&metas[i]); + if (off - next_off >= size && + next_off / PAGE_SIZE == (next_off + size - 1) / PAGE_SIZE) { + break; + } + next_off = off + metas[i].size; + } + + meta->page_idx = next_off / PAGE_SIZE; + meta->page_off = next_off % PAGE_SIZE; + meta->size = size; + + if (meta->page_idx >= kvs->page_cnt) { + p = __kv_store_add_page(kvs); + if (!p) + return -ENOMEM; + + err = bpf_map_update_elem(kvs->data_map_fd, &kvs->task_fd, &kvs->data, 0); + if (err) { + __kv_store_del_page(kvs); + return err; + } + } + + return 0; +} + +int kv_store_put(struct kv_store *kvs, int key, void *val, unsigned int val_size) +{ + struct kv_store_meta *meta; + struct kv_store_page *p; + int err; + + meta = kvs_store_get_meta(kvs, key); + if (!meta) + return -ENOENT; + + if (!meta->init) { + if (val_size > KVS_MAX_VAL_SIZE) + return -E2BIG; + + err = kv_store_find_next_slot(kvs, val_size, meta); + if (err) + return err; + } + + 
p = kvs->data.pages[meta->page_idx].page; + val_size = val_size < meta->size ? val_size : meta->size; + memcpy((char *)p->data + meta->page_off, val, val_size); + meta->init = 1; + return 0; +} + +void kv_store_delete(struct kv_store *kvs, int key) +{ + struct kv_store_meta *meta; + struct kv_store_page *p; + + meta = kvs_store_get_meta(kvs, key); + if (!meta) + return; + + p = kvs->data.pages[meta->page_idx].page; + memset(p->data + meta->page_off, 0, meta->size); + memset(meta, 0, sizeof(*meta)); +} + +int kv_store_update_value_size(struct kv_store *kvs, int key, unsigned int val_size) +{ + struct kv_store_meta *meta, new_meta; + struct kv_store_page *old_p, *new_p; + int err; + + if (val_size > KVS_MAX_VAL_SIZE) + return -E2BIG; + + meta = kvs_store_get_meta(kvs, key); + if (!meta || !meta->init) + return -ENOENT; + + if (val_size <= meta->size) { + meta->size = val_size; + return 0; + } + + err = kv_store_find_next_slot(kvs, val_size, &new_meta); + if (err) + return -ENOSPC; + + old_p = kvs->data.pages[meta->page_idx].page; + new_p = kvs->data.pages[new_meta.page_idx].page; + + memcpy(new_p->data + new_meta.page_off, + old_p->data + meta->page_off, meta->size); + + return 0; +} diff --git a/tools/testing/selftests/bpf/prog_tests/uptr_kv_store.h b/tools/testing/selftests/bpf/prog_tests/uptr_kv_store.h new file mode 100644 index 000000000000..a1da3e6e2de3 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/uptr_kv_store.h @@ -0,0 +1,22 @@ +#ifndef _UPTR_KV_STORE_H +#define _UPTR_KV_STORE_H + +#include "uptr_kv_store_common.h" + +struct kv_store; + +void kv_store_close(struct kv_store *kvs); + +struct kv_store *kv_store_init(int pid, struct bpf_map *data_map, const char *pin_path); + +int kv_store_data_map_set_reuse(struct kv_store *kvs, struct bpf_map *data_map); + +void *kv_store_get(struct kv_store *kvs, int key); + +int kv_store_put(struct kv_store *kvs, int key, void *val, unsigned int val_size); + +void kv_store_delete(struct kv_store *kvs, int key); 
+ +int kv_store_update_value_size(struct kv_store *kvs, int key, unsigned int val_size); + +#endif diff --git a/tools/testing/selftests/bpf/progs/uptr_kv_store.h b/tools/testing/selftests/bpf/progs/uptr_kv_store.h new file mode 100644 index 000000000000..9109073a4933 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/uptr_kv_store.h @@ -0,0 +1,120 @@ +#ifndef _UPTR_KV_STORE_H +#define _UPTR_KV_STORE_H + +#include <errno.h> +#include <string.h> +#include <bpf/bpf_helpers.h> + +#include "uptr_kv_store_common.h" + +struct { + __uint(type, BPF_MAP_TYPE_TASK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct kv_store_data_map_value); +} data_map SEC(".maps"); + +static int bpf_dynptr_from_kv_store(struct kv_store_data_map_value *data, int key, + unsigned int val_size, struct bpf_dynptr *ptr, + struct kv_store_meta **meta) +{ + struct kv_store_page *p = NULL; + u16 _key = 0; + + if (!data || !data->metas) + return -ENOENT; + + /* workaround. llvm generates memory access with unbound key with the following code: + * if (key >= KVS_MAX_VAL_ENTRIES) + * return -ENOENT; + * + * ; *meta = &data->metas->meta[key]; @ uptr_kv_store.h:37 + * 62: (bc) w2 = w2 ; frame1: R2_w=scalar(id=3,smin=0,smax=umax=0xffffffff,smax32=1023,var_off=(0x0; 0xffffffff)) + * 63: (67) r2 <<= 32 ; frame1: R2_w=scalar(smax=0x3ff00000000,umax=0xffffffff00000000,smin32=0,smax32=umax32=0,var_off=(0x0; 0xffffffff00000000)) + * 64: (c7) r2 s>>= 32 ; frame1: R2_w=scalar(smin=0xffffffff80000000,smax=smax32=1023) + * 65: (67) r2 <<= 2 ; frame1: R2_w=scalar(smax=0x7ffffffffffffffc,umax=0xfffffffffffffffc,smax32=0x7ffffffc,umax32=0xfffffffc,var_off=(0x0; 0xfffffffffffffffc)) + * 66: (0f) r6 += r2 + * math between mem pointer and register with unbounded min value is not allowed + */ + _key += key; + if (_key >= KVS_MAX_VAL_ENTRIES) + return -ENOENT; + + *meta = &data->metas->meta[_key]; + if (!(*meta)->init) + return -ENOENT; + + /* workaround for variable offset uptr 
access: + * p = data->pages[meta->page_idx].page; + */ + switch((*meta)->page_idx) { + case 0: p = data->pages[0].page; break; + case 1: p = data->pages[1].page; break; + case 2: p = data->pages[2].page; break; + case 3: p = data->pages[3].page; break; + case 4: p = data->pages[4].page; break; + case 5: p = data->pages[5].page; break; + case 6: p = data->pages[6].page; break; + case 7: p = data->pages[7].page; break; + } + + if (!p) + return -ENOENT; + + val_size = val_size < (*meta)->size ? val_size : (*meta)->size; + + if ((*meta)->page_off >= KVS_MAX_VAL_SIZE) + return -EINVAL; + + return bpf_dynptr_from_mem(p->data, KVS_MAX_VAL_SIZE, 0, ptr); +} + +__attribute__((unused)) +static int kv_store_put(struct kv_store_data_map_value *data, int key, + void *val, unsigned int val_size) +{ + struct kv_store_meta *meta; + struct bpf_dynptr ptr; + int err; + + err = bpf_dynptr_from_kv_store(data, key, val_size, &ptr, &meta); + if (err) + return err; + + return bpf_dynptr_write(&ptr, meta->page_off, val, val_size, 0); +} + +__attribute__((unused)) +static int kv_store_get(struct kv_store_data_map_value *data, int key, + void *val, unsigned int val_size) +{ + struct kv_store_meta *meta; + struct bpf_dynptr ptr; + int err; + + err = bpf_dynptr_from_kv_store(data, key, val_size, &ptr, &meta); + if (err) + return err; + + return bpf_dynptr_read(val, val_size, &ptr, meta->page_off, 0); +} + +__attribute__((unused)) +static int kv_store_delete(struct kv_store_data_map_value *data, int key) +{ + struct kv_store_meta *meta; + u16 _key = 0; + + if (!data || !data->metas) + return -ENOENT; + + _key += key; + if (_key >= KVS_MAX_VAL_ENTRIES) + return -ENOENT; + + meta = &data->metas->meta[_key]; + meta->init = 0; + return 0; +} + +#endif diff --git a/tools/testing/selftests/bpf/uptr_kv_store_common.h b/tools/testing/selftests/bpf/uptr_kv_store_common.h new file mode 100644 index 000000000000..af69cd0b32da --- /dev/null +++ b/tools/testing/selftests/bpf/uptr_kv_store_common.h @@ -0,0 
+1,47 @@ +#ifndef _UPTR_KV_STORE_COMMON_H +#define _UPTR_KV_STORE_COMMON_H + +#define PAGE_SIZE 4096 +#define KVS_MAX_KEY_SIZE 32 +#define KVS_MAX_VAL_SIZE PAGE_SIZE +#define KVS_MAX_VAL_ENTRIES 1024 + +#define KVS_VALUE_INFO_PAGE_IDX_BIT 3 +#define KVS_VALUE_INFO_PAGE_OFF_BIT 12 +#define KVS_VALUE_INFO_VAL_SIZE_BIT 12 + +#define KVS_MAX_PAGE_ENTRIES (1 << KVS_VALUE_INFO_PAGE_IDX_BIT) + +#ifdef __BPF__ +struct kv_store_page *dummy_page; +struct kv_store_metas *dummy_metas; +#else +#define __uptr +#define __kptr +#endif + +struct kv_store_meta { + __u32 page_idx:KVS_VALUE_INFO_PAGE_IDX_BIT; + __u32 page_off:KVS_VALUE_INFO_PAGE_OFF_BIT; + __u32 size:KVS_VALUE_INFO_VAL_SIZE_BIT; + __u32 init:1; +}; + +struct kv_store_metas { + struct kv_store_meta meta[KVS_MAX_VAL_ENTRIES]; +}; + +struct kv_store_page_entry { + struct kv_store_page __uptr *page; +}; + +struct kv_store_data_map_value { + struct kv_store_metas __uptr *metas; + struct kv_store_page_entry pages[KVS_MAX_PAGE_ENTRIES]; +}; + +struct kv_store_page { + char data[KVS_MAX_VAL_SIZE]; +}; + +#endif -- 2.47.1