On Fri, Feb 17, 2023 at 11:19 AM Daniel Müller <deso@xxxxxxxxxx> wrote: > > This change implements support for reading zip archives, including > opening an archive, finding an entry based on its path and name in it, > and closing it. > The code was copied from https://github.com/iovisor/bcc/pull/4440, which > implements similar functionality for bcc. The author confirmed that he > is fine with this usage and the corresponding relicensing. I adjusted it > to adhere to libbpf coding standards. Let's record this with Acked-by: Michał Gregorczyk <michalgr@xxxxxxxx> tag added next to your Signed-off-by. I've added Michal to cc for visibility. > > Signed-off-by: Daniel Müller <deso@xxxxxxxxxx> > --- > tools/lib/bpf/Build | 2 +- > tools/lib/bpf/zip.c | 378 ++++++++++++++++++++++++++++++++++++++++++++ > tools/lib/bpf/zip.h | 47 ++++++ > 3 files changed, 426 insertions(+), 1 deletion(-) > create mode 100644 tools/lib/bpf/zip.c > create mode 100644 tools/lib/bpf/zip.h > > diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build > index 5a3dfb..b8b0a63 100644 > --- a/tools/lib/bpf/Build > +++ b/tools/lib/bpf/Build > @@ -1,4 +1,4 @@ > libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ > netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \ > btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \ > - usdt.o > + usdt.o zip.o > diff --git a/tools/lib/bpf/zip.c b/tools/lib/bpf/zip.c > new file mode 100644 > index 0000000..59ec79 > --- /dev/null > +++ b/tools/lib/bpf/zip.c > @@ -0,0 +1,378 @@ > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) > +/* > + * Routines for dealing with .zip archives. > + * > + * Copyright (c) Meta Platforms, Inc. and affiliates. > + */ > + > +#include <fcntl.h> > +#include <stdint.h> > +#include <stdlib.h> > +#include <string.h> > +#include <sys/mman.h> > +#include <unistd.h> > + > +#include "zip.h" > + > +/* Specification of ZIP file format can be found here: > + * https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT > + * For a high level overview of the structure of a ZIP file see > + * sections 4.3.1 - 4.3.6. > + * > + * Data structures appearing in ZIP files do not contain any > + * padding and they might be misaligned. To allow us to safely > + * operate on pointers to such structures and their members, without > + * worrying of platform specific alignment issues, we define > + * unaligned_uint16_t and unaligned_uint32_t types with no alignment > + * requirements. > + */ > +typedef struct { > + uint8_t raw[2]; > +} unaligned_uint16_t; > + > +static uint16_t unaligned_uint16_read(unaligned_uint16_t value) > +{ > + uint16_t return_value; > + > + memcpy(&return_value, value.raw, sizeof(return_value)); > + return return_value; > +} > + > +typedef struct { > + uint8_t raw[4]; > +} unaligned_uint32_t; > + > +static uint32_t unaligned_uint32_read(unaligned_uint32_t value) > +{ > + uint32_t return_value; > + > + memcpy(&return_value, value.raw, sizeof(return_value)); > + return return_value; > +} it feels like an overkill to use this unaligned_xxx_t approach. Let's use __attribute__((packed)) on corresponding structs and just uint16_t/uint32_t field types. > + > +#define END_OF_CD_RECORD_MAGIC 0x06054b50 > + > +/* See section 4.3.16 of the spec. */ > +struct end_of_central_directory_record { > + /* Magic value equal to END_OF_CD_RECORD_MAGIC */ > + unaligned_uint32_t magic; > + > + /* Number of the file containing this structure or 0xFFFF if ZIP64 archive. > + * Zip archive might span multiple files (disks). > + */ > + unaligned_uint16_t this_disk; > + > + /* Number of the file containing the beginning of the central directory or > + * 0xFFFF if ZIP64 archive. > + */ > + unaligned_uint16_t cd_disk; > + > + /* Number of central directory records on this disk or 0xFFFF if ZIP64 > + * archive. > + */ > + unaligned_uint16_t cd_records; > + > + /* Number of central directory records on all disks or 0xFFFF if ZIP64 > + * archive. > + */ > + unaligned_uint16_t cd_records_total; > + > + /* Size of the central directory recrod or 0xFFFFFFFF if ZIP64 archive. */ typo: record > + unaligned_uint32_t cd_size; > + > + /* Offset of the central directory from the beginning of the archive or > + * 0xFFFFFFFF if ZIP64 archive. > + */ > + unaligned_uint32_t cd_offset; > + > + /* Length of comment data following end of central driectory record. */ typo: directory > + unaligned_uint16_t comment_length; > + > + /* Up to 64k of arbitrary bytes. */ > + /* uint8_t comment[comment_length] */ > +}; > + > +#define CD_FILE_HEADER_MAGIC 0x02014b50 > +#define FLAG_ENCRYPTED (1 << 0) > +#define FLAG_HAS_DATA_DESCRIPTOR (1 << 3) > + > +/* See section 4.3.12 of the spec. */ > +struct central_directory_file_header { naming nit here and below: we use CD in constants, but spell out "central_directory", which feels a bit mouthful. Let's use "cd" consistently throughout (even if it's a bit confusing with "change directory", but I think in the context of parsing ZIP contents should be recognizable enough) > + /* Magic value equal to CD_FILE_HEADER_MAGIC. */ > + unaligned_uint32_t magic; > + unaligned_uint16_t version; > + /* Minimum zip version needed to extract the file. */ > + unaligned_uint16_t min_version; > + unaligned_uint16_t flags; > + unaligned_uint16_t compression; > + unaligned_uint16_t last_modified_time; > + unaligned_uint16_t last_modified_date; > + unaligned_uint32_t crc; > + unaligned_uint32_t compressed_size; > + unaligned_uint32_t uncompressed_size; > + unaligned_uint16_t file_name_length; > + unaligned_uint16_t extra_field_length; > + unaligned_uint16_t file_comment_length; > + /* Number of the disk where the file starts or 0xFFFF if ZIP64 archive. */ > + unaligned_uint16_t disk; > + unaligned_uint16_t internal_attributes; > + unaligned_uint32_t external_attributes; > + /* Offset from the start of the disk containing the local file header to the > + * start of the local file header. > + */ > + unaligned_uint32_t offset; > +}; > + > +#define LOCAL_FILE_HEADER_MAGIC 0x04034b50 > + > +/* See section 4.3.7 of the spec. */ > +struct local_file_header { > + /* Magic value equal to LOCAL_FILE_HEADER_MAGIC. */ > + unaligned_uint32_t magic; > + /* Minimum zip version needed to extract the file. */ > + unaligned_uint16_t min_version; > + unaligned_uint16_t flags; > + unaligned_uint16_t compression; > + unaligned_uint16_t last_modified_time; > + unaligned_uint16_t last_modified_date; > + unaligned_uint32_t crc; > + unaligned_uint32_t compressed_size; > + unaligned_uint32_t uncompressed_size; > + unaligned_uint16_t file_name_length; > + unaligned_uint16_t extra_field_length; > +}; > + > +struct zip_archive { > + void *data; > + uint32_t size; > + uint32_t cd_offset; > + uint32_t cd_records; > +}; > + $ rg -w 'uint\d\d_t' | wc -l 21 $ rg -w '__u\d\d' | wc -l 873 seems like we overwhelmingly prefer __u32/__u16 in libbpf code base, let's use those instead? > +static void *check_access(struct zip_archive *archive, uint32_t offset, uint32_t size) > +{ > + if (offset + size > archive->size || offset > offset + size) { > + return NULL; > + } > + return archive->data + offset; > +} > + > +/* Returns 0 on success, -1 on error and -2 if the eocd indicates let's use -EINVAL and -ENOTSUP instead of -1 and -2 > + * the archive uses features which are not supported. > + */ > +static int try_parse_end_of_central_directory(struct zip_archive *archive, uint32_t offset) > +{ > + struct end_of_central_directory_record *eocd = > + check_access(archive, offset, sizeof(struct end_of_central_directory_record)); > + uint16_t comment_length, cd_records; > + uint32_t cd_offset, cd_size; > + > + if (!eocd || unaligned_uint32_read(eocd->magic) != END_OF_CD_RECORD_MAGIC) { > + return -1; > + } > + > + comment_length = unaligned_uint16_read(eocd->comment_length); > + if (offset + sizeof(struct end_of_central_directory_record) + comment_length != > + archive->size) { > + return -1; > + } > + > + cd_records = unaligned_uint16_read(eocd->cd_records); > + if (unaligned_uint16_read(eocd->this_disk) != 0 || > + unaligned_uint16_read(eocd->cd_disk) != 0 || > + unaligned_uint16_read(eocd->cd_records_total) != cd_records) { > + /* This is a valid eocd, but we only support single-file non-ZIP64 archives. */ > + return -2; > + } > + > + cd_offset = unaligned_uint32_read(eocd->cd_offset); > + cd_size = unaligned_uint32_read(eocd->cd_size); > + if (!check_access(archive, cd_offset, cd_size)) { > + return -1; > + } > + > + archive->cd_offset = cd_offset; > + archive->cd_records = cd_records; > + return 0; > +} > + > +static int find_central_directory(struct zip_archive *archive) > +{ > + uint32_t offset; > + int64_t limit; > + int rc = -1; > + > + if (archive->size <= sizeof(struct end_of_central_directory_record)) { > + return -1; > + } > + > + /* Because the end of central directory ends with a variable length array of > + * up to 0xFFFF bytes we can't know exactly where it starts and need to > + * search for it at the end of the file, scanning the (limit, offset] range. > + */ > + offset = archive->size - sizeof(struct end_of_central_directory_record); > + limit = (int64_t)offset - (1 << 16); > + > + for (; offset >= 0 && offset > limit && rc == -1; offset--) { > + rc = try_parse_end_of_central_directory(archive, offset); > + } > + > + return rc; > +} > + > +struct zip_archive *zip_archive_open(const char *path) > +{ > + struct zip_archive *archive; > + int fd = open(path, O_RDONLY); let's not do complicated operations during variable initialization, let's just init fd right before checking it for <0 > + off_t size; > + void *data; > + > + if (fd < 0) { > + return NULL; > + } no {} for single-line if statement > + > + size = lseek(fd, 0, SEEK_END); > + if (size == (off_t)-1 || size > UINT32_MAX) { > + close(fd); > + return NULL; > + } > + > + data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); > + close(fd); > + > + if (data == MAP_FAILED) { > + return NULL; > + } ditto about {} > + > + archive = malloc(sizeof(struct zip_archive)); > + if (!archive) { > + munmap(data, size); > + return NULL; > + }; > + > + archive->data = data; > + archive->size = size; > + if (find_central_directory(archive)) { > + munmap(data, size); > + free(archive); > + archive = NULL; return NULL; ? > + } > + > + return archive; > +} > + > +void zip_archive_close(struct zip_archive *archive) > +{ > + munmap(archive->data, archive->size); > + free(archive); > +} > + > +static struct local_file_header *local_file_header_at_offset(struct zip_archive *archive, > + uint32_t offset) is there a non-local file in ZIP archive? Seems a bit too verbose, just "file_header_at_offset"? and seeing "get_entry_at_offset" below, should it be "get_file_header_at_offset" then? > +{ > + struct local_file_header *lfh = > + check_access(archive, offset, sizeof(struct local_file_header)); empty line between variables and code > + if (!lfh || unaligned_uint32_read(lfh->magic) != LOCAL_FILE_HEADER_MAGIC) { > + return NULL; > + } here and below, same {} issue > + return lfh; > +} > + > +static int get_entry_at_offset(struct zip_archive *archive, uint32_t offset, struct zip_entry *out) > +{ > + struct local_file_header *lfh = local_file_header_at_offset(archive, offset); same as above, let's not do extensive operations during variable initialization, please split > + uint16_t flags, name_length, extra_field_length; > + uint32_t compressed_size; > + const char *name; > + void *data; > + > + offset += sizeof(struct local_file_header); logically this probably should happen after we checked lfh for NULL? > + if (!lfh) { > + return -1; here and everywhere, let's use kernel error code constants, e.g., -EINVAL > + }; > + > + flags = unaligned_uint16_read(lfh->flags); > + if ((flags & FLAG_ENCRYPTED) || (flags & FLAG_HAS_DATA_DESCRIPTOR)) { > + return -1; > + } > + > + name_length = unaligned_uint16_read(lfh->file_name_length); > + name = check_access(archive, offset, name_length); > + offset += name_length; > + if (!name) { > + return -1; > + } > + > + extra_field_length = unaligned_uint16_read(lfh->extra_field_length); > + if (!check_access(archive, offset, extra_field_length)) { > + return -1; > + } > + offset += extra_field_length; > + > + compressed_size = unaligned_uint32_read(lfh->compressed_size); why not fill out out->data_length instead of all these local variables? > + data = check_access(archive, offset, compressed_size); > + if (!data) { > + return -1; > + } > + > + out->compression = unaligned_uint16_read(lfh->compression); > + out->name_length = name_length; > + out->name = name; > + out->data = data; > + out->data_length = compressed_size; > + out->data_offset = offset; > + > + return 0; > +} > + > +static struct central_directory_file_header *cd_file_header_at_offset(struct zip_archive *archive, > + uint32_t offset) this function is called just once below, why a helper? > +{ > + struct central_directory_file_header *cdfh = > + check_access(archive, offset, sizeof(struct central_directory_file_header)); empty line > + if (!cdfh || unaligned_uint32_read(cdfh->magic) != CD_FILE_HEADER_MAGIC) { > + return NULL; > + } > + return cdfh; > +} > + > +int zip_archive_find_entry(struct zip_archive *archive, const char *file_name, > + struct zip_entry *out) > +{ > + size_t file_name_length = strlen(file_name); > + no need for empty line, let's keep variable declarations in one block > + uint32_t i, offset = archive->cd_offset; > + > + for (i = 0; i < archive->cd_records; ++i) { > + struct central_directory_file_header *cdfh = > + cd_file_header_at_offset(archive, offset); > + uint16_t cdfh_name_length, cdfh_flags; > + const char *cdfh_name; > + > + offset += sizeof(struct central_directory_file_header); same, logically offset should be updated after we made sure we have cd_file_header > + if (!cdfh) { > + return -1; > + } > + > + cdfh_name_length = unaligned_uint16_read(cdfh->file_name_length); > + cdfh_name = check_access(archive, offset, cdfh_name_length); > + if (!cdfh_name) { > + return -1; > + } > + > + cdfh_flags = unaligned_uint16_read(cdfh->flags); > + if ((cdfh_flags & FLAG_ENCRYPTED) == 0 && > + (cdfh_flags & FLAG_HAS_DATA_DESCRIPTOR) == 0 && > + file_name_length == cdfh_name_length && > + memcmp(file_name, archive->data + offset, file_name_length) == 0) { > + return get_entry_at_offset(archive, unaligned_uint32_read(cdfh->offset), > + out); > + } > + > + offset += cdfh_name_length; > + offset += unaligned_uint16_read(cdfh->extra_field_length); > + offset += unaligned_uint16_read(cdfh->file_comment_length); > + } > + > + return -1; > +} > diff --git a/tools/lib/bpf/zip.h b/tools/lib/bpf/zip.h > new file mode 100644 > index 0000000..a9083f > --- /dev/null > +++ b/tools/lib/bpf/zip.h > @@ -0,0 +1,47 @@ > +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ > + > +#ifndef __LIBBPF_ZIP_H > +#define __LIBBPF_ZIP_H > + > +#include <stdint.h> > + > +/* Represents an opened zip archive. s/opened/open/ > + * Only basic ZIP files are supported, in particular the following are not > + * supported: > + * - encryption > + * - streaming > + * - multi-part ZIP files > + * - ZIP64 > + */ > +struct zip_archive; > + > +/* Carries information on name, compression method, and data corresponding to a > + * file in a zip archive. > + */ > +struct zip_entry { > + /* Compression method as defined in pkzip spec. 0 means data is uncompressed. */ > + uint16_t compression; > + > + /* Non-null terminated name of the file. */ > + const char *name; > + /* Length of the file name. */ > + uint16_t name_length; > + > + /* Pointer to the file data. */ > + const void *data; > + /* Length of the file data. */ > + uint32_t data_length; > + /* Offset of the file data within the archive. */ > + uint32_t data_offset; > +}; > + > +/* Open a zip archive. Returns NULL in case of an error. */ > +struct zip_archive *zip_archive_open(const char *path); > + > +/* Close a zip archive and release resources. */ > +void zip_archive_close(struct zip_archive *archive); > + > +/* Look up an entry corresponding to a file in given zip archive. */ > +int zip_archive_find_entry(struct zip_archive *archive, const char *name, struct zip_entry *out); > + > +#endif > -- > 2.30.2 >