On Fri, Feb 17, 2023 at 04:10:04PM -0800, Andrii Nakryiko wrote: > On Fri, Feb 17, 2023 at 11:19 AM Daniel Müller <deso@xxxxxxxxxx> wrote: > > > > This change implements support for reading zip archives, including > > opening an archive, finding an entry based on its path and name in it, > > and closing it. > > The code was copied from https://github.com/iovisor/bcc/pull/4440, which > > implements similar functionality for bcc. The author confirmed that he > > is fine with this usage and the corresponding relicensing. I adjusted it > > to adhere to libbpf coding standards. > > Let's record this with > > Acked-by: Michał Gregorczyk <michalgr@xxxxxxxx> > > tag added next to your Signed-off-by. I've added Michal to cc for visibility. > > > > > Signed-off-by: Daniel Müller <deso@xxxxxxxxxx> > > --- > > tools/lib/bpf/Build | 2 +- > > tools/lib/bpf/zip.c | 378 ++++++++++++++++++++++++++++++++++++++++++++ > > tools/lib/bpf/zip.h | 47 ++++++ > > 3 files changed, 426 insertions(+), 1 deletion(-) > > create mode 100644 tools/lib/bpf/zip.c > > create mode 100644 tools/lib/bpf/zip.h > > > > diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build > > index 5a3dfb..b8b0a63 100644 > > --- a/tools/lib/bpf/Build > > +++ b/tools/lib/bpf/Build > > @@ -1,4 +1,4 @@ > > libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ > > netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \ > > btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \ > > - usdt.o > > + usdt.o zip.o > > diff --git a/tools/lib/bpf/zip.c b/tools/lib/bpf/zip.c > > new file mode 100644 > > index 0000000..59ec79 > > --- /dev/null > > +++ b/tools/lib/bpf/zip.c > > @@ -0,0 +1,378 @@ > > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) > > +/* > > + * Routines for dealing with .zip archives. > > + * > > + * Copyright (c) Meta Platforms, Inc. and affiliates. > > + */ > > + > > +#include <fcntl.h> > > +#include <stdint.h> > > +#include <stdlib.h> > > +#include <string.h> > > +#include <sys/mman.h> > > +#include <unistd.h> > > + > > +#include "zip.h" > > + > > +/* Specification of ZIP file format can be found here: > > + * https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT > > + * For a high level overview of the structure of a ZIP file see > > + * sections 4.3.1 - 4.3.6. > > + * > > + * Data structures appearing in ZIP files do not contain any > > + * padding and they might be misaligned. To allow us to safely > > + * operate on pointers to such structures and their members, without > > + * worrying of platform specific alignment issues, we define > > + * unaligned_uint16_t and unaligned_uint32_t types with no alignment > > + * requirements. > > + */ > > +typedef struct { > > + uint8_t raw[2]; > > +} unaligned_uint16_t; > > + > > +static uint16_t unaligned_uint16_read(unaligned_uint16_t value) > > +{ > > + uint16_t return_value; > > + > > + memcpy(&return_value, value.raw, sizeof(return_value)); > > + return return_value; > > +} > > + > > +typedef struct { > > + uint8_t raw[4]; > > +} unaligned_uint32_t; > > + > > +static uint32_t unaligned_uint32_read(unaligned_uint32_t value) > > +{ > > + uint32_t return_value; > > + > > + memcpy(&return_value, value.raw, sizeof(return_value)); > > + return return_value; > > +} > > it feels like an overkill to use this unaligned_xxx_t approach. Let's > use __attribute__((packed)) on corresponding structs and just > uint16_t/uint32_t field types. > > > + > > +#define END_OF_CD_RECORD_MAGIC 0x06054b50 > > + > > +/* See section 4.3.16 of the spec. */ > > +struct end_of_central_directory_record { > > + /* Magic value equal to END_OF_CD_RECORD_MAGIC */ > > + unaligned_uint32_t magic; > > + > > + /* Number of the file containing this structure or 0xFFFF if ZIP64 archive. > > + * Zip archive might span multiple files (disks). > > + */ > > + unaligned_uint16_t this_disk; > > + > > + /* Number of the file containing the beginning of the central directory or > > + * 0xFFFF if ZIP64 archive. > > + */ > > + unaligned_uint16_t cd_disk; > > + > > + /* Number of central directory records on this disk or 0xFFFF if ZIP64 > > + * archive. > > + */ > > + unaligned_uint16_t cd_records; > > + > > + /* Number of central directory records on all disks or 0xFFFF if ZIP64 > > + * archive. > > + */ > > + unaligned_uint16_t cd_records_total; > > + > > + /* Size of the central directory recrod or 0xFFFFFFFF if ZIP64 archive. */ > > typo: record > > > + unaligned_uint32_t cd_size; > > + > > + /* Offset of the central directory from the beginning of the archive or > > + * 0xFFFFFFFF if ZIP64 archive. > > + */ > > + unaligned_uint32_t cd_offset; > > + > > + /* Length of comment data following end of central driectory record. */ > > typo: directory > > > > + unaligned_uint16_t comment_length; > > + > > + /* Up to 64k of arbitrary bytes. */ > > + /* uint8_t comment[comment_length] */ > > +}; > > + > > +#define CD_FILE_HEADER_MAGIC 0x02014b50 > > +#define FLAG_ENCRYPTED (1 << 0) > > +#define FLAG_HAS_DATA_DESCRIPTOR (1 << 3) > > + > > +/* See section 4.3.12 of the spec. */ > > +struct central_directory_file_header { > > > naming nit here and below: we use CD in constants, but spell out > "central_directory", which feels a bit mouthful. Let's use "cd" > consistently throughout (even if it's a bit confusing with "change > directory", but I think in the context of parsing ZIP contents should > be recognizable enough) Sure. > > + /* Magic value equal to CD_FILE_HEADER_MAGIC. */ > > + unaligned_uint32_t magic; > > + unaligned_uint16_t version; > > + /* Minimum zip version needed to extract the file. */ > > + unaligned_uint16_t min_version; > > + unaligned_uint16_t flags; > > + unaligned_uint16_t compression; > > + unaligned_uint16_t last_modified_time; > > + unaligned_uint16_t last_modified_date; > > + unaligned_uint32_t crc; > > + unaligned_uint32_t compressed_size; > > + unaligned_uint32_t uncompressed_size; > > + unaligned_uint16_t file_name_length; > > + unaligned_uint16_t extra_field_length; > > + unaligned_uint16_t file_comment_length; > > + /* Number of the disk where the file starts or 0xFFFF if ZIP64 archive. */ > > + unaligned_uint16_t disk; > > + unaligned_uint16_t internal_attributes; > > + unaligned_uint32_t external_attributes; > > + /* Offset from the start of the disk containing the local file header to the > > + * start of the local file header. > > + */ > > + unaligned_uint32_t offset; > > +}; > > + > > +#define LOCAL_FILE_HEADER_MAGIC 0x04034b50 > > + > > +/* See section 4.3.7 of the spec. */ > > +struct local_file_header { > > + /* Magic value equal to LOCAL_FILE_HEADER_MAGIC. */ > > + unaligned_uint32_t magic; > > + /* Minimum zip version needed to extract the file. */ > > + unaligned_uint16_t min_version; > > + unaligned_uint16_t flags; > > + unaligned_uint16_t compression; > > + unaligned_uint16_t last_modified_time; > > + unaligned_uint16_t last_modified_date; > > + unaligned_uint32_t crc; > > + unaligned_uint32_t compressed_size; > > + unaligned_uint32_t uncompressed_size; > > + unaligned_uint16_t file_name_length; > > + unaligned_uint16_t extra_field_length; > > +}; > > + > > +struct zip_archive { > > + void *data; > > + uint32_t size; > > + uint32_t cd_offset; > > + uint32_t cd_records; > > +}; > > + > > $ rg -w 'uint\d\d_t' | wc -l > 21 > $ rg -w '__u\d\d' | wc -l > 873 > > seems like we overwhelmingly prefer __u32/__u16 in libbpf code base, > let's use those instead? Sure. > > +static void *check_access(struct zip_archive *archive, uint32_t offset, uint32_t size) > > +{ > > + if (offset + size > archive->size || offset > offset + size) { > > + return NULL; > > + } > > + return archive->data + offset; > > +} > > + > > +/* Returns 0 on success, -1 on error and -2 if the eocd indicates > > let's use -EINVAL and -ENOTSUP instead of -1 and -2 Sure. > > + * the archive uses features which are not supported. > > + */ > > +static int try_parse_end_of_central_directory(struct zip_archive *archive, uint32_t offset) > > +{ > > + struct end_of_central_directory_record *eocd = > > + check_access(archive, offset, sizeof(struct end_of_central_directory_record)); > > + uint16_t comment_length, cd_records; > > + uint32_t cd_offset, cd_size; > > + > > + if (!eocd || unaligned_uint32_read(eocd->magic) != END_OF_CD_RECORD_MAGIC) { > > + return -1; > > + } > > + > > + comment_length = unaligned_uint16_read(eocd->comment_length); > > + if (offset + sizeof(struct end_of_central_directory_record) + comment_length != > > + archive->size) { > > + return -1; > > + } > > + > > + cd_records = unaligned_uint16_read(eocd->cd_records); > > + if (unaligned_uint16_read(eocd->this_disk) != 0 || > > + unaligned_uint16_read(eocd->cd_disk) != 0 || > > + unaligned_uint16_read(eocd->cd_records_total) != cd_records) { > > + /* This is a valid eocd, but we only support single-file non-ZIP64 archives. */ > > + return -2; > > + } > > + > > + cd_offset = unaligned_uint32_read(eocd->cd_offset); > > + cd_size = unaligned_uint32_read(eocd->cd_size); > > + if (!check_access(archive, cd_offset, cd_size)) { > > + return -1; > > + } > > + > > + archive->cd_offset = cd_offset; > > + archive->cd_records = cd_records; > > + return 0; > > +} > > + > > +static int find_central_directory(struct zip_archive *archive) > > +{ > > + uint32_t offset; > > + int64_t limit; > > + int rc = -1; > > + > > + if (archive->size <= sizeof(struct end_of_central_directory_record)) { > > + return -1; > > + } > > + > > + /* Because the end of central directory ends with a variable length array of > > + * up to 0xFFFF bytes we can't know exactly where it starts and need to > > + * search for it at the end of the file, scanning the (limit, offset] range. > > + */ > > + offset = archive->size - sizeof(struct end_of_central_directory_record); > > + limit = (int64_t)offset - (1 << 16); > > + > > + for (; offset >= 0 && offset > limit && rc == -1; offset--) { > > + rc = try_parse_end_of_central_directory(archive, offset); > > + } > > + > > + return rc; > > +} > > + > > +struct zip_archive *zip_archive_open(const char *path) > > +{ > > + struct zip_archive *archive; > > + int fd = open(path, O_RDONLY); > > let's not do complicated operations during variable initialization, > let's just init fd right before checking it for <0 > > > + off_t size; > > + void *data; > > + > > + if (fd < 0) { > > + return NULL; > > + } > > no {} for single-line if statement > > > + > > + size = lseek(fd, 0, SEEK_END); > > + if (size == (off_t)-1 || size > UINT32_MAX) { > > + close(fd); > > + return NULL; > > + } > > + > > + data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); > > + close(fd); > > + > > + if (data == MAP_FAILED) { > > + return NULL; > > + } > > ditto about {} > > > + > > + archive = malloc(sizeof(struct zip_archive)); > > + if (!archive) { > > + munmap(data, size); > > + return NULL; > > + }; > > + > > + archive->data = data; > > + archive->size = size; > > + if (find_central_directory(archive)) { > > + munmap(data, size); > > + free(archive); > > + archive = NULL; > > return NULL; ? > > > + } > > + > > + return archive; > > +} > > + > > +void zip_archive_close(struct zip_archive *archive) > > +{ > > + munmap(archive->data, archive->size); > > + free(archive); > > +} > > + > > +static struct local_file_header *local_file_header_at_offset(struct zip_archive *archive, > > + uint32_t offset) > > is there a non-local file in ZIP archive? Seems a bit too verbose, > just "file_header_at_offset"? and seeing "get_entry_at_offset" below, > should it be "get_file_header_at_offset" then? It's called that by the standard. See section 4.3.7 Local file header in https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT > > +{ > > + struct local_file_header *lfh = > > + check_access(archive, offset, sizeof(struct local_file_header)); > > empty line between variables and code > > > + if (!lfh || unaligned_uint32_read(lfh->magic) != LOCAL_FILE_HEADER_MAGIC) { > > + return NULL; > > + } > > here and below, same {} issue > > > + return lfh; > > +} > > + > > +static int get_entry_at_offset(struct zip_archive *archive, uint32_t offset, struct zip_entry *out) > > +{ > > + struct local_file_header *lfh = local_file_header_at_offset(archive, offset); > > same as above, let's not do extensive operations during variable > initialization, please split > > > + uint16_t flags, name_length, extra_field_length; > > + uint32_t compressed_size; > > + const char *name; > > + void *data; > > + > > + offset += sizeof(struct local_file_header); > > logically this probably should happen after we checked lfh for NULL? Sure. > > + if (!lfh) { > > + return -1; > > here and everywhere, let's use kernel error code constants, e.g., -EINVAL Sure. > > + }; > > + > > + flags = unaligned_uint16_read(lfh->flags); > > + if ((flags & FLAG_ENCRYPTED) || (flags & FLAG_HAS_DATA_DESCRIPTOR)) { > > + return -1; > > + } > > + > > + name_length = unaligned_uint16_read(lfh->file_name_length); > > + name = check_access(archive, offset, name_length); > > + offset += name_length; > > + if (!name) { > > + return -1; > > + } > > + > > + extra_field_length = unaligned_uint16_read(lfh->extra_field_length); > > + if (!check_access(archive, offset, extra_field_length)) { > > + return -1; > > + } > > + offset += extra_field_length; > > + > > + compressed_size = unaligned_uint32_read(lfh->compressed_size); > > why not fill out out->data_length instead of all these local variables? Because the function contract is that 'out' is left untouched on error. > > + data = check_access(archive, offset, compressed_size); > > + if (!data) { > > + return -1; > > + } > > + > > + out->compression = unaligned_uint16_read(lfh->compression); > > + out->name_length = name_length; > > + out->name = name; > > + out->data = data; > > + out->data_length = compressed_size; > > + out->data_offset = offset; > > + > > + return 0; > > +} > > + > > +static struct central_directory_file_header *cd_file_header_at_offset(struct zip_archive *archive, > > + uint32_t offset) > > this function is called just once below, why a helper? Will inline it. > > +{ > > + struct central_directory_file_header *cdfh = > > + check_access(archive, offset, sizeof(struct central_directory_file_header)); > > empty line > > > + if (!cdfh || unaligned_uint32_read(cdfh->magic) != CD_FILE_HEADER_MAGIC) { > > + return NULL; > > + } > > + return cdfh; > > +} > > + > > +int zip_archive_find_entry(struct zip_archive *archive, const char *file_name, > > + struct zip_entry *out) > > +{ > > + size_t file_name_length = strlen(file_name); > > + > > no need for empty line, let's keep variable declarations in one block > > > + uint32_t i, offset = archive->cd_offset; > > + > > + for (i = 0; i < archive->cd_records; ++i) { > > + struct central_directory_file_header *cdfh = > > + cd_file_header_at_offset(archive, offset); > > + uint16_t cdfh_name_length, cdfh_flags; > > + const char *cdfh_name; > > + > > + offset += sizeof(struct central_directory_file_header); > > same, logically offset should be updated after we made sure we have > cd_file_header > > > + if (!cdfh) { > > + return -1; > > + } > > + > > + cdfh_name_length = unaligned_uint16_read(cdfh->file_name_length); > > + cdfh_name = check_access(archive, offset, cdfh_name_length); > > + if (!cdfh_name) { > > + return -1; > > + } > > + > > + cdfh_flags = unaligned_uint16_read(cdfh->flags); > > + if ((cdfh_flags & FLAG_ENCRYPTED) == 0 && > > + (cdfh_flags & FLAG_HAS_DATA_DESCRIPTOR) == 0 && > > + file_name_length == cdfh_name_length && > > + memcmp(file_name, archive->data + offset, file_name_length) == 0) { > > + return get_entry_at_offset(archive, unaligned_uint32_read(cdfh->offset), > > + out); > > + } > > + > > + offset += cdfh_name_length; > > + offset += unaligned_uint16_read(cdfh->extra_field_length); > > + offset += unaligned_uint16_read(cdfh->file_comment_length); > > + } > > + > > + return -1; > > +} > > diff --git a/tools/lib/bpf/zip.h b/tools/lib/bpf/zip.h > > new file mode 100644 > > index 0000000..a9083f > > --- /dev/null > > +++ b/tools/lib/bpf/zip.h > > @@ -0,0 +1,47 @@ > > +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ > > + > > +#ifndef __LIBBPF_ZIP_H > > +#define __LIBBPF_ZIP_H > > + > > +#include <stdint.h> > > + > > +/* Represents an opened zip archive. > > s/opened/open/ > > > > + * Only basic ZIP files are supported, in particular the following are not > > + * supported: > > + * - encryption > > + * - streaming > > + * - multi-part ZIP files > > + * - ZIP64 > > + */ > > +struct zip_archive; > > + > > +/* Carries information on name, compression method, and data corresponding to a > > + * file in a zip archive. > > + */ > > +struct zip_entry { > > + /* Compression method as defined in pkzip spec. 0 means data is uncompressed. */ > > + uint16_t compression; > > + > > + /* Non-null terminated name of the file. */ > > + const char *name; > > + /* Length of the file name. */ > > + uint16_t name_length; > > + > > + /* Pointer to the file data. */ > > + const void *data; > > + /* Length of the file data. */ > > + uint32_t data_length; > > + /* Offset of the file data within the archive. */ > > + uint32_t data_offset; > > +}; > > + > > +/* Open a zip archive. Returns NULL in case of an error. */ > > +struct zip_archive *zip_archive_open(const char *path); > > + > > +/* Close a zip archive and release resources. */ > > +void zip_archive_close(struct zip_archive *archive); > > + > > +/* Look up an entry corresponding to a file in given zip archive. */ > > +int zip_archive_find_entry(struct zip_archive *archive, const char *name, struct zip_entry *out); > > + > > +#endif > > -- > > 2.30.2 > >