Re: [PATCH bpf-next 1/3] libbpf: Implement basic zip archive parsing support

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Feb 17, 2023 at 04:10:04PM -0800, Andrii Nakryiko wrote:
> On Fri, Feb 17, 2023 at 11:19 AM Daniel Müller <deso@xxxxxxxxxx> wrote:
> >
> > This change implements support for reading zip archives, including
> > opening an archive, finding an entry based on its path and name in it,
> > and closing it.
> > The code was copied from https://github.com/iovisor/bcc/pull/4440, which
> > implements similar functionality for bcc. The author confirmed that he
> > is fine with this usage and the corresponding relicensing. I adjusted it
> > to adhere to libbpf coding standards.
> 
> Let's record this with
> 
> Acked-by: Michał Gregorczyk <michalgr@xxxxxxxx>
> 
> tag added next to your Signed-off-by. I've added Michal to cc for visibility.
> 
> >
> > Signed-off-by: Daniel Müller <deso@xxxxxxxxxx>
> > ---
> >  tools/lib/bpf/Build |   2 +-
> >  tools/lib/bpf/zip.c | 378 ++++++++++++++++++++++++++++++++++++++++++++
> >  tools/lib/bpf/zip.h |  47 ++++++
> >  3 files changed, 426 insertions(+), 1 deletion(-)
> >  create mode 100644 tools/lib/bpf/zip.c
> >  create mode 100644 tools/lib/bpf/zip.h
> >
> > diff --git a/tools/lib/bpf/Build b/tools/lib/bpf/Build
> > index 5a3dfb..b8b0a63 100644
> > --- a/tools/lib/bpf/Build
> > +++ b/tools/lib/bpf/Build
> > @@ -1,4 +1,4 @@
> >  libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
> >             netlink.o bpf_prog_linfo.o libbpf_probes.o hashmap.o \
> >             btf_dump.o ringbuf.o strset.o linker.o gen_loader.o relo_core.o \
> > -           usdt.o
> > +           usdt.o zip.o
> > diff --git a/tools/lib/bpf/zip.c b/tools/lib/bpf/zip.c
> > new file mode 100644
> > index 0000000..59ec79
> > --- /dev/null
> > +++ b/tools/lib/bpf/zip.c
> > @@ -0,0 +1,378 @@
> > +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
> > +/*
> > + * Routines for dealing with .zip archives.
> > + *
> > + * Copyright (c) Meta Platforms, Inc. and affiliates.
> > + */
> > +
> > +#include <fcntl.h>
> > +#include <stdint.h>
> > +#include <stdlib.h>
> > +#include <string.h>
> > +#include <sys/mman.h>
> > +#include <unistd.h>
> > +
> > +#include "zip.h"
> > +
> > +/* Specification of ZIP file format can be found here:
> > + * https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT
> > + * For a high level overview of the structure of a ZIP file see
> > + * sections 4.3.1 - 4.3.6.
> > + *
> > + * Data structures appearing in ZIP files do not contain any
> > + * padding and they might be misaligned. To allow us to safely
> > + * operate on pointers to such structures and their members, without
> > + * worrying of platform specific alignment issues, we define
> > + * unaligned_uint16_t and unaligned_uint32_t types with no alignment
> > + * requirements.
> > + */
> > +typedef struct {
> > +       uint8_t raw[2];
> > +} unaligned_uint16_t;
> > +
> > +static uint16_t unaligned_uint16_read(unaligned_uint16_t value)
> > +{
> > +       uint16_t return_value;
> > +
> > +       memcpy(&return_value, value.raw, sizeof(return_value));
> > +       return return_value;
> > +}
> > +
> > +typedef struct {
> > +       uint8_t raw[4];
> > +} unaligned_uint32_t;
> > +
> > +static uint32_t unaligned_uint32_read(unaligned_uint32_t value)
> > +{
> > +       uint32_t return_value;
> > +
> > +       memcpy(&return_value, value.raw, sizeof(return_value));
> > +       return return_value;
> > +}
> 
> it feels like an overkill to use this unaligned_xxx_t approach. Let's
> use __attribute__((packed)) on corresponding structs and just
> uint16_t/uint32_t field types.
> 
> > +
> > +#define END_OF_CD_RECORD_MAGIC 0x06054b50
> > +
> > +/* See section 4.3.16 of the spec. */
> > +struct end_of_central_directory_record {
> > +       /* Magic value equal to END_OF_CD_RECORD_MAGIC */
> > +       unaligned_uint32_t magic;
> > +
> > +       /* Number of the file containing this structure or 0xFFFF if ZIP64 archive.
> > +        * Zip archive might span multiple files (disks).
> > +        */
> > +       unaligned_uint16_t this_disk;
> > +
> > +       /* Number of the file containing the beginning of the central directory or
> > +        * 0xFFFF if ZIP64 archive.
> > +        */
> > +       unaligned_uint16_t cd_disk;
> > +
> > +       /* Number of central directory records on this disk or 0xFFFF if ZIP64
> > +        * archive.
> > +        */
> > +       unaligned_uint16_t cd_records;
> > +
> > +       /* Number of central directory records on all disks or 0xFFFF if ZIP64
> > +        * archive.
> > +        */
> > +       unaligned_uint16_t cd_records_total;
> > +
> > +       /* Size of the central directory recrod or 0xFFFFFFFF if ZIP64 archive. */
> 
> typo: record
> 
> > +       unaligned_uint32_t cd_size;
> > +
> > +       /* Offset of the central directory from the beginning of the archive or
> > +        * 0xFFFFFFFF if ZIP64 archive.
> > +        */
> > +       unaligned_uint32_t cd_offset;
> > +
> > +       /* Length of comment data following end of central driectory record. */
> 
> typo: directory
> 
> 
> > +       unaligned_uint16_t comment_length;
> > +
> > +       /* Up to 64k of arbitrary bytes. */
> > +       /* uint8_t comment[comment_length] */
> > +};
> > +
> > +#define CD_FILE_HEADER_MAGIC 0x02014b50
> > +#define FLAG_ENCRYPTED (1 << 0)
> > +#define FLAG_HAS_DATA_DESCRIPTOR (1 << 3)
> > +
> > +/* See section 4.3.12 of the spec. */
> > +struct central_directory_file_header {
> 
> 
> naming nit here and below: we use CD in constants, but spell out
> "central_directory", which feels a bit mouthful. Let's use "cd"
> consistently throughout (even if it's a bit confusing with "change
> directory", but I think in the context of parsing ZIP contents should
> be recognizable enough)

Sure.

> > +       /* Magic value equal to CD_FILE_HEADER_MAGIC. */
> > +       unaligned_uint32_t magic;
> > +       unaligned_uint16_t version;
> > +       /* Minimum zip version needed to extract the file. */
> > +       unaligned_uint16_t min_version;
> > +       unaligned_uint16_t flags;
> > +       unaligned_uint16_t compression;
> > +       unaligned_uint16_t last_modified_time;
> > +       unaligned_uint16_t last_modified_date;
> > +       unaligned_uint32_t crc;
> > +       unaligned_uint32_t compressed_size;
> > +       unaligned_uint32_t uncompressed_size;
> > +       unaligned_uint16_t file_name_length;
> > +       unaligned_uint16_t extra_field_length;
> > +       unaligned_uint16_t file_comment_length;
> > +       /* Number of the disk where the file starts or 0xFFFF if ZIP64 archive. */
> > +       unaligned_uint16_t disk;
> > +       unaligned_uint16_t internal_attributes;
> > +       unaligned_uint32_t external_attributes;
> > +       /* Offset from the start of the disk containing the local file header to the
> > +        * start of the local file header.
> > +        */
> > +       unaligned_uint32_t offset;
> > +};
> > +
> > +#define LOCAL_FILE_HEADER_MAGIC 0x04034b50
> > +
> > +/* See section 4.3.7 of the spec. */
> > +struct local_file_header {
> > +       /* Magic value equal to LOCAL_FILE_HEADER_MAGIC. */
> > +       unaligned_uint32_t magic;
> > +       /* Minimum zip version needed to extract the file. */
> > +       unaligned_uint16_t min_version;
> > +       unaligned_uint16_t flags;
> > +       unaligned_uint16_t compression;
> > +       unaligned_uint16_t last_modified_time;
> > +       unaligned_uint16_t last_modified_date;
> > +       unaligned_uint32_t crc;
> > +       unaligned_uint32_t compressed_size;
> > +       unaligned_uint32_t uncompressed_size;
> > +       unaligned_uint16_t file_name_length;
> > +       unaligned_uint16_t extra_field_length;
> > +};
> > +
> > +struct zip_archive {
> > +       void *data;
> > +       uint32_t size;
> > +       uint32_t cd_offset;
> > +       uint32_t cd_records;
> > +};
> > +
> 
> $ rg -w 'uint\d\d_t' | wc -l
> 21
> $ rg -w '__u\d\d' | wc -l
> 873
> 
> seems like we overwhelmingly prefer __u32/__u16 in libbpf code base,
> let's use those instead?

Sure.

> > +static void *check_access(struct zip_archive *archive, uint32_t offset, uint32_t size)
> > +{
> > +       if (offset + size > archive->size || offset > offset + size) {
> > +               return NULL;
> > +       }
> > +       return archive->data + offset;
> > +}
> > +
> > +/* Returns 0 on success, -1 on error and -2 if the eocd indicates
> 
> let's use -EINVAL and -ENOTSUP instead of -1 and -2

Sure.

> > + * the archive uses features which are not supported.
> > + */
> > +static int try_parse_end_of_central_directory(struct zip_archive *archive, uint32_t offset)
> > +{
> > +       struct end_of_central_directory_record *eocd =
> > +               check_access(archive, offset, sizeof(struct end_of_central_directory_record));
> > +       uint16_t comment_length, cd_records;
> > +       uint32_t cd_offset, cd_size;
> > +
> > +       if (!eocd || unaligned_uint32_read(eocd->magic) != END_OF_CD_RECORD_MAGIC) {
> > +               return -1;
> > +       }
> > +
> > +       comment_length = unaligned_uint16_read(eocd->comment_length);
> > +       if (offset + sizeof(struct end_of_central_directory_record) + comment_length !=
> > +           archive->size) {
> > +               return -1;
> > +       }
> > +
> > +       cd_records = unaligned_uint16_read(eocd->cd_records);
> > +       if (unaligned_uint16_read(eocd->this_disk) != 0 ||
> > +           unaligned_uint16_read(eocd->cd_disk) != 0 ||
> > +           unaligned_uint16_read(eocd->cd_records_total) != cd_records) {
> > +               /* This is a valid eocd, but we only support single-file non-ZIP64 archives. */
> > +               return -2;
> > +       }
> > +
> > +       cd_offset = unaligned_uint32_read(eocd->cd_offset);
> > +       cd_size = unaligned_uint32_read(eocd->cd_size);
> > +       if (!check_access(archive, cd_offset, cd_size)) {
> > +               return -1;
> > +       }
> > +
> > +       archive->cd_offset = cd_offset;
> > +       archive->cd_records = cd_records;
> > +       return 0;
> > +}
> > +
> > +static int find_central_directory(struct zip_archive *archive)
> > +{
> > +       uint32_t offset;
> > +       int64_t limit;
> > +       int rc = -1;
> > +
> > +       if (archive->size <= sizeof(struct end_of_central_directory_record)) {
> > +               return -1;
> > +       }
> > +
> > +       /* Because the end of central directory ends with a variable length array of
> > +        * up to 0xFFFF bytes we can't know exactly where it starts and need to
> > +        * search for it at the end of the file, scanning the (limit, offset] range.
> > +        */
> > +       offset = archive->size - sizeof(struct end_of_central_directory_record);
> > +       limit = (int64_t)offset - (1 << 16);
> > +
> > +       for (; offset >= 0 && offset > limit && rc == -1; offset--) {
> > +               rc = try_parse_end_of_central_directory(archive, offset);
> > +       }
> > +
> > +       return rc;
> > +}
> > +
> > +struct zip_archive *zip_archive_open(const char *path)
> > +{
> > +       struct zip_archive *archive;
> > +       int fd = open(path, O_RDONLY);
> 
> let's not do complicated operations during variable initialization,
> let's just init fd right before checking it for <0
> 
> > +       off_t size;
> > +       void *data;
> > +
> > +       if (fd < 0) {
> > +               return NULL;
> > +       }
> 
> no {} for single-line if statement
> 
> > +
> > +       size = lseek(fd, 0, SEEK_END);
> > +       if (size == (off_t)-1 || size > UINT32_MAX) {
> > +               close(fd);
> > +               return NULL;
> > +       }
> > +
> > +       data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
> > +       close(fd);
> > +
> > +       if (data == MAP_FAILED) {
> > +               return NULL;
> > +       }
> 
> ditto about {}
> 
> > +
> > +       archive = malloc(sizeof(struct zip_archive));
> > +       if (!archive) {
> > +               munmap(data, size);
> > +               return NULL;
> > +       };
> > +
> > +       archive->data = data;
> > +       archive->size = size;
> > +       if (find_central_directory(archive)) {
> > +               munmap(data, size);
> > +               free(archive);
> > +               archive = NULL;
> 
> return NULL; ?
> 
> > +       }
> > +
> > +       return archive;
> > +}
> > +
> > +void zip_archive_close(struct zip_archive *archive)
> > +{
> > +       munmap(archive->data, archive->size);
> > +       free(archive);
> > +}
> > +
> > +static struct local_file_header *local_file_header_at_offset(struct zip_archive *archive,
> > +                                                            uint32_t offset)
> 
> is there a non-local file in ZIP archive? Seems a bit too verbose,
> just "file_header_at_offset"? and seeing "get_entry_at_offset" below,
> should it be "get_file_header_at_offset" then?

It's called that by the standard. See section 4.3.7  Local file header in
https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT

> > +{
> > +       struct local_file_header *lfh =
> > +               check_access(archive, offset, sizeof(struct local_file_header));
> 
> empty line between variables and code
> 
> > +       if (!lfh || unaligned_uint32_read(lfh->magic) != LOCAL_FILE_HEADER_MAGIC) {
> > +               return NULL;
> > +       }
> 
> here and below, same {} issue
> 
> > +       return lfh;
> > +}
> > +
> > +static int get_entry_at_offset(struct zip_archive *archive, uint32_t offset, struct zip_entry *out)
> > +{
> > +       struct local_file_header *lfh = local_file_header_at_offset(archive, offset);
> 
> same as above, let's not do extensive operations during variable
> initialization, please split
> 
> > +       uint16_t flags, name_length, extra_field_length;
> > +       uint32_t compressed_size;
> > +       const char *name;
> > +       void *data;
> > +
> > +       offset += sizeof(struct local_file_header);
> 
> logically this probably should happen after we checked lfh for NULL?

Sure.

> > +       if (!lfh) {
> > +               return -1;
> 
> here and everywhere, let's use kernel error code constants, e.g., -EINVAL

Sure.

> > +       };
> > +
> > +       flags = unaligned_uint16_read(lfh->flags);
> > +       if ((flags & FLAG_ENCRYPTED) || (flags & FLAG_HAS_DATA_DESCRIPTOR)) {
> > +               return -1;
> > +       }
> > +
> > +       name_length = unaligned_uint16_read(lfh->file_name_length);
> > +       name = check_access(archive, offset, name_length);
> > +       offset += name_length;
> > +       if (!name) {
> > +               return -1;
> > +       }
> > +
> > +       extra_field_length = unaligned_uint16_read(lfh->extra_field_length);
> > +       if (!check_access(archive, offset, extra_field_length)) {
> > +               return -1;
> > +       }
> > +       offset += extra_field_length;
> > +
> > +       compressed_size = unaligned_uint32_read(lfh->compressed_size);
> 
> why not fill out out->data_length instead of all these local variables?

Because the function contract is that 'out' is left untouched on error.

> > +       data = check_access(archive, offset, compressed_size);
> > +       if (!data) {
> > +               return -1;
> > +       }
> > +
> > +       out->compression = unaligned_uint16_read(lfh->compression);
> > +       out->name_length = name_length;
> > +       out->name = name;
> > +       out->data = data;
> > +       out->data_length = compressed_size;
> > +       out->data_offset = offset;
> > +
> > +       return 0;
> > +}
> > +
> > +static struct central_directory_file_header *cd_file_header_at_offset(struct zip_archive *archive,
> > +                                                                     uint32_t offset)
> 
> this function is called just once below, why a helper?

Will inline it.

> > +{
> > +       struct central_directory_file_header *cdfh =
> > +               check_access(archive, offset, sizeof(struct central_directory_file_header));
> 
> empty line
> 
> > +       if (!cdfh || unaligned_uint32_read(cdfh->magic) != CD_FILE_HEADER_MAGIC) {
> > +               return NULL;
> > +       }
> > +       return cdfh;
> > +}
> > +
> > +int zip_archive_find_entry(struct zip_archive *archive, const char *file_name,
> > +                          struct zip_entry *out)
> > +{
> > +       size_t file_name_length = strlen(file_name);
> > +
> 
> no need for empty line, let's keep variable declarations in one block
> 
> > +       uint32_t i, offset = archive->cd_offset;
> > +
> > +       for (i = 0; i < archive->cd_records; ++i) {
> > +               struct central_directory_file_header *cdfh =
> > +                       cd_file_header_at_offset(archive, offset);
> > +               uint16_t cdfh_name_length, cdfh_flags;
> > +               const char *cdfh_name;
> > +
> > +               offset += sizeof(struct central_directory_file_header);
> 
> same, logically offset should be updated after we made sure we have
> cd_file_header
> 
> > +               if (!cdfh) {
> > +                       return -1;
> > +               }
> > +
> > +               cdfh_name_length = unaligned_uint16_read(cdfh->file_name_length);
> > +               cdfh_name = check_access(archive, offset, cdfh_name_length);
> > +               if (!cdfh_name) {
> > +                       return -1;
> > +               }
> > +
> > +               cdfh_flags = unaligned_uint16_read(cdfh->flags);
> > +               if ((cdfh_flags & FLAG_ENCRYPTED) == 0 &&
> > +                   (cdfh_flags & FLAG_HAS_DATA_DESCRIPTOR) == 0 &&
> > +                   file_name_length == cdfh_name_length &&
> > +                   memcmp(file_name, archive->data + offset, file_name_length) == 0) {
> > +                       return get_entry_at_offset(archive, unaligned_uint32_read(cdfh->offset),
> > +                                                  out);
> > +               }
> > +
> > +               offset += cdfh_name_length;
> > +               offset += unaligned_uint16_read(cdfh->extra_field_length);
> > +               offset += unaligned_uint16_read(cdfh->file_comment_length);
> > +       }
> > +
> > +       return -1;
> > +}
> > diff --git a/tools/lib/bpf/zip.h b/tools/lib/bpf/zip.h
> > new file mode 100644
> > index 0000000..a9083f
> > --- /dev/null
> > +++ b/tools/lib/bpf/zip.h
> > @@ -0,0 +1,47 @@
> > +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
> > +
> > +#ifndef __LIBBPF_ZIP_H
> > +#define __LIBBPF_ZIP_H
> > +
> > +#include <stdint.h>
> > +
> > +/* Represents an opened zip archive.
> 
> s/opened/open/
> 
> 
> > + * Only basic ZIP files are supported, in particular the following are not
> > + * supported:
> > + * - encryption
> > + * - streaming
> > + * - multi-part ZIP files
> > + * - ZIP64
> > + */
> > +struct zip_archive;
> > +
> > +/* Carries information on name, compression method, and data corresponding to a
> > + * file in a zip archive.
> > + */
> > +struct zip_entry {
> > +       /* Compression method as defined in pkzip spec. 0 means data is uncompressed. */
> > +       uint16_t compression;
> > +
> > +       /* Non-null terminated name of the file. */
> > +       const char *name;
> > +       /* Length of the file name. */
> > +       uint16_t name_length;
> > +
> > +       /* Pointer to the file data. */
> > +       const void *data;
> > +       /* Length of the file data. */
> > +       uint32_t data_length;
> > +       /* Offset of the file data within the archive. */
> > +       uint32_t data_offset;
> > +};
> > +
> > +/* Open a zip archive. Returns NULL in case of an error. */
> > +struct zip_archive *zip_archive_open(const char *path);
> > +
> > +/* Close a zip archive and release resources. */
> > +void zip_archive_close(struct zip_archive *archive);
> > +
> > +/* Look up an entry corresponding to a file in given zip archive. */
> > +int zip_archive_find_entry(struct zip_archive *archive, const char *name, struct zip_entry *out);
> > +
> > +#endif
> > --
> > 2.30.2
> >



[Index of Archives]     [Linux Samsung SoC]     [Linux Rockchip SoC]     [Linux Actions SoC]     [Linux for Synopsys ARC Processors]     [Linux NFS]     [Linux NILFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]


  Powered by Linux