Re: [PATCH v2 3/3] libselinux: performance optimization for duplicate detection

Christian Göttsche <cgzones@xxxxxxxxxxxxxx> · Fri, 24 Feb 2023 13:00:30 +0100

On Fri, 17 Feb 2023 at 09:45, wanghuizhao <wanghuizhao1@xxxxxxxxxx> wrote:
>
> When semodule -i some.pp to install a module package, duplicate items are
> detected for the module. The detection function is nodups_specs in
> libselinux/src/label_file.c. The algorithm complexity of implementing
> this function is O(N^2). In scenarios where N is very large, the efficiency
> is very low.
>
> To solve this problem, I propose to use the hash table to detect duplicates.
> The algorithm complexity of new implementing is O(N). The execution efficiency
> will be greatly improved.

Isn't the new complexity O(N * log(N)) due to the hashtable insertion
of O(log(N))?

>
> Comparison between the execution time of the nodups_specs function.
>
> Old double-layer loop implementation O(N^2):
>
> semodule -i myapp1.pp
> nodups_specs data->nspec: 5002
> nodups_specs start: 11785.242s
> nodups_specs end:   11785.588s
> nodups_specs consumes:  0.346s
>
> semodule -i myapp2.pp
> nodups_specs data->nspec: 10002
> nodups_specs start: 11804.280s
> nodups_specs end:   11806.546s
> nodups_specs consumes:  2.266s
>
> semodule -i myapp3.pp
> nodups_specs data->nspec: 20002
> nodups_specs start: 11819.106s
> nodups_specs end:   11830.892s
> nodups_specs consumes: 11.786s
>
> New hash table implementation O(N):
>
> semodule -i myapp1.pp
> nodups_specs data->nspec: 5002
> nodups_specs start: 11785.588s
> nodups_specs end:   11785.590s
> nodups_specs consumes:  0.002s
>
> semodule -i myapp2.pp
> nodups_specs data->nspec: 10002
> nodups_specs start: 11806.546s
> nodups_specs end:   11806.552s
> nodups_specs consumes:  0.006s
>
> semodule -i myapp3.pp
> nodups_specs data->nspec: 20002
> nodups_specs start: 11830.892s
> nodups_specs end:   11830.905s
> nodups_specs consumes:  0.013s
>
> Signed-off-by: wanghuizhao <wanghuizhao1@xxxxxxxxxx>
> ---
>  libselinux/src/label_file.c | 118 +++++++++++++++++++++++++++++++++++---------
>  1 file changed, 94 insertions(+), 24 deletions(-)
>
> diff --git a/libselinux/src/label_file.c b/libselinux/src/label_file.c
> index 74ae9b9f..eebf9665 100644
> --- a/libselinux/src/label_file.c
> +++ b/libselinux/src/label_file.c
> @@ -19,10 +19,17 @@
>  #include <sys/types.h>
>  #include <sys/stat.h>
>
> +#include "hashtab.h"
>  #include "callbacks.h"
>  #include "label_internal.h"
>  #include "label_file.h"
>
> +
> +struct chkdups_key {
> +       char *regex;
> +       unsigned int mode;
> +};
> +
>  /*
>   * Internals, mostly moved over from matchpathcon.c
>   */
> @@ -57,40 +64,103 @@ static int find_stem_from_file(struct saved_data *data, const char *key)
>  }
>
>  /*
> + * hash calculation and key comparison of hash table
> + */
> +
> +static unsigned int symhash(hashtab_t h, const_hashtab_key_t key)
> +{
> +       const struct chkdups_key *k = (const struct chkdups_key *)key;
> +       const char *p = NULL;
> +       size_t size;
> +       unsigned int val = 0;
> +
> +       size = strlen(k->regex);
> +       for (p = k->regex; ((size_t) (p - k->regex)) < size; p++)
> +               val =
> +                       ((val << 4) | ((val >> (8 * sizeof(unsigned int) - 4)) +
> +                       k->mode)) ^ (*p);

v1 added k->mode after the bit-wise or (probably changed by the added
parenthesis due to the compiler warning).
Using

     (((val << 4) | (val >> (8 * sizeof(unsigned int) - 4))) + k->mode) ^ (*p);

gives a slightly better spread (tested against refpolicy (master)):

    nodups_spec:  6062 entries and 606/606 buckets used, longest chain length 21
vs
    nodups_spec:  6062 entries and 606/606 buckets used, longest chain length 20

And for a adjusted hashtable size (see below):

    nodups_spec:  6062 entries and 3807/6062 buckets used, longest
chain length 6
vs
    nodups_spec:  6062 entries and 3815/6062 buckets used, longest
chain length 6

> +       return val % h->size;
> +}
> +
> +static int symcmp(hashtab_t h
> +                 __attribute__ ((unused)), const_hashtab_key_t key1,
> +                 const_hashtab_key_t key2)
> +{
> +       const struct chkdups_key *a = (const struct chkdups_key *)key1;
> +       const struct chkdups_key *b = (const struct chkdups_key *)key2;
> +
> +       return strcmp(a->regex, b->regex) || (a->mode && b->mode && a->mode != b->mode);
> +}
> +
> +static int destroy_chkdups_key(hashtab_key_t key)
> +{
> +       free(key);
> +
> +       return 0;
> +}
> +
> +/*
>   * Warn about duplicate specifications.
>   */
>  static int nodups_specs(struct saved_data *data, const char *path)
>  {
> -       int rc = 0;
> -       unsigned int ii, jj;
> +       int rc = 0, ret = 0;
> +       unsigned int ii;
>         struct spec *curr_spec, *spec_arr = data->spec_arr;
> +       struct chkdups_key *new = NULL;
> +       unsigned int hashtab_len = (data->nspec / 10) ? data->nspec / 10 : 1;
>
> +       hashtab_t hash_table = hashtab_create(symhash, symcmp, hashtab_len);

v1 used a hashtable size of `data->nspec`, instead of its tenth.
Since the hashtable implementation from newrole does not contain a
resize feature, like the one from libsepol, the size will be fixed for
its lifetime.
Using `data.>nspec` gives slightly better (but probably negligible) performance:

    Benchmark 1: DESTDIR=~/Downloads/destdir/
./scripts/env_use_destdir ~/Downloads/destdir/sbin/setfiles -c
../refpolicy/tmp/policy.bin ../refpolicy/tmp/all_mods.fc
     Time (mean ± σ):     340.4 ms ±  14.4 ms    [User: 280.6 ms,
System: 59.7 ms]
     Range (min … max):   328.1 ms … 386.0 ms    30 runs
vs
    Benchmark 1: DESTDIR=~/Downloads/destdir/
./scripts/env_use_destdir ~/Downloads/destdir/sbin/setfiles -c
../refpolicy/tmp/policy.bin ../refpolicy/tmp/all_mods.fc
     Time (mean ± σ):     334.7 ms ±   5.9 ms    [User: 279.6 ms,
System: 55.0 ms]
     Range (min … max):   327.5 ms … 362.1 ms    30 runs

Since your policy contains much more file context definitions, could
you run some benchmarks yourself?

> +       if (!hash_table) {
> +               rc = -1;
> +               COMPAT_LOG(SELINUX_ERROR, "%s: hashtab create failed.\n", path);
> +               return rc;
> +       }
>         for (ii = 0; ii < data->nspec; ii++) {
> -               curr_spec = &spec_arr[ii];
> -               for (jj = ii + 1; jj < data->nspec; jj++) {
> -                       if ((!strcmp(spec_arr[jj].regex_str,
> -                               curr_spec->regex_str))
> -                           && (!spec_arr[jj].mode || !curr_spec->mode
> -                               || spec_arr[jj].mode == curr_spec->mode)) {
> -                               rc = -1;
> -                               errno = EINVAL;
> -                               if (strcmp(spec_arr[jj].lr.ctx_raw,
> -                                           curr_spec->lr.ctx_raw)) {
> -                                       COMPAT_LOG
> -                                               (SELINUX_ERROR,
> -                                                "%s: Multiple different specifications for %s  (%s and %s).\n",
> -                                                path, curr_spec->regex_str,
> -                                                spec_arr[jj].lr.ctx_raw,
> -                                                curr_spec->lr.ctx_raw);
> -                               } else {
> -                                       COMPAT_LOG
> -                                               (SELINUX_ERROR,
> -                                                "%s: Multiple same specifications for %s.\n",
> -                                                path, curr_spec->regex_str);
> -                               }
> +               new = (struct chkdups_key *)malloc(sizeof(struct chkdups_key));
> +               if (!new) {
> +                       rc = -1;
> +                       COMPAT_LOG(SELINUX_ERROR, "%s: hashtab key create failed.\n", path);
> +                       return rc;
> +               }
> +               new->regex = spec_arr[ii].regex_str;
> +               new->mode = spec_arr[ii].mode;
> +               ret = hashtab_insert(hash_table, (hashtab_key_t)new, &spec_arr[ii]);
> +               if (ret == HASHTAB_SUCCESS)
> +                       continue;
> +               if (ret == HASHTAB_PRESENT) {
> +                       curr_spec =
> +                               (struct spec *)hashtab_search(hash_table, (hashtab_key_t)new);
> +                       rc = -1;
> +                       errno = EINVAL;
> +                       free(new);
> +                       if (strcmp(spec_arr[ii].lr.ctx_raw, curr_spec->lr.ctx_raw)) {
> +                               COMPAT_LOG
> +                                       (SELINUX_ERROR,
> +                                        "%s: Multiple different specifications for %s  (%s and %s).\n",
> +                                        path, curr_spec->regex_str,
> +                                        spec_arr[ii].lr.ctx_raw,
> +                                        curr_spec->lr.ctx_raw);
> +                       } else {
> +                               COMPAT_LOG
> +                                       (SELINUX_ERROR,
> +                                        "%s: Multiple same specifications for %s.\n",
> +                                        path, curr_spec->regex_str);
>                         }
>                 }
> +               if (ret == HASHTAB_OVERFLOW) {
> +                       rc = -1;
> +                       free(new);
> +                       COMPAT_LOG
> +                               (SELINUX_ERROR,
> +                               "%s: hashtab happen memory error.\n",
> +                               path);
> +                       break;
> +               }
>         }
> +
> +       hashtab_destroy_key(hash_table, destroy_chkdups_key);
> +
>         return rc;
>  }
>
> --
> 2.12.3
>