On Fri, 17 Feb 2023 at 09:45, wanghuizhao <wanghuizhao1@xxxxxxxxxx> wrote: > > When semodule -i some.pp to install a module package, duplicate items are > detected for the module. The detection function is nodups_specs in > libselinux/src/label_file.c. The algorithm complexity of implementing > this function is O(N^2). In scenarios where N is very large, the efficiency > is very low. > > To solve this problem, I propose to use the hash table to detect duplicates. > The algorithm complexity of new implementing is O(N). The execution efficiency > will be greatly improved. Isn't the new complexity O(N * log(N)) due to the hashtable insertion of O(log(N))? > > Comparison between the execution time of the nodups_specs function. > > Old double-layer loop implementation O(N^2): > > semodule -i myapp1.pp > nodups_specs data->nspec: 5002 > nodups_specs start: 11785.242s > nodups_specs end: 11785.588s > nodups_specs consumes: 0.346s > > semodule -i myapp2.pp > nodups_specs data->nspec: 10002 > nodups_specs start: 11804.280s > nodups_specs end: 11806.546s > nodups_specs consumes: 2.266s > > semodule -i myapp3.pp > nodups_specs data->nspec: 20002 > nodups_specs start: 11819.106s > nodups_specs end: 11830.892s > nodups_specs consumes: 11.786s > > New hash table implementation O(N): > > semodule -i myapp1.pp > nodups_specs data->nspec: 5002 > nodups_specs start: 11785.588s > nodups_specs end: 11785.590s > nodups_specs consumes: 0.002s > > semodule -i myapp2.pp > nodups_specs data->nspec: 10002 > nodups_specs start: 11806.546s > nodups_specs end: 11806.552s > nodups_specs consumes: 0.006s > > semodule -i myapp3.pp > nodups_specs data->nspec: 20002 > nodups_specs start: 11830.892s > nodups_specs end: 11830.905s > nodups_specs consumes: 0.013s > > Signed-off-by: wanghuizhao <wanghuizhao1@xxxxxxxxxx> > --- > libselinux/src/label_file.c | 118 +++++++++++++++++++++++++++++++++++--------- > 1 file changed, 94 insertions(+), 24 deletions(-) > > diff --git a/libselinux/src/label_file.c b/libselinux/src/label_file.c > index 74ae9b9f..eebf9665 100644 > --- a/libselinux/src/label_file.c > +++ b/libselinux/src/label_file.c > @@ -19,10 +19,17 @@ > #include <sys/types.h> > #include <sys/stat.h> > > +#include "hashtab.h" > #include "callbacks.h" > #include "label_internal.h" > #include "label_file.h" > > + > +struct chkdups_key { > + char *regex; > + unsigned int mode; > +}; > + > /* > * Internals, mostly moved over from matchpathcon.c > */ > @@ -57,40 +64,103 @@ static int find_stem_from_file(struct saved_data *data, const char *key) > } > > /* > + * hash calculation and key comparison of hash table > + */ > + > +static unsigned int symhash(hashtab_t h, const_hashtab_key_t key) > +{ > + const struct chkdups_key *k = (const struct chkdups_key *)key; > + const char *p = NULL; > + size_t size; > + unsigned int val = 0; > + > + size = strlen(k->regex); > + for (p = k->regex; ((size_t) (p - k->regex)) < size; p++) > + val = > + ((val << 4) | ((val >> (8 * sizeof(unsigned int) - 4)) + > + k->mode)) ^ (*p); v1 added k->mode after the bit-wise or (probably changed by the added parenthesis due to the compiler warning). Using (((val << 4) | (val >> (8 * sizeof(unsigned int) - 4))) + k->mode) ^ (*p); gives a slightly better spread (tested against refpolicy (master)): nodups_spec: 6062 entries and 606/606 buckets used, longest chain length 21 vs nodups_spec: 6062 entries and 606/606 buckets used, longest chain length 20 And for a adjusted hashtable size (see below): nodups_spec: 6062 entries and 3807/6062 buckets used, longest chain length 6 vs nodups_spec: 6062 entries and 3815/6062 buckets used, longest chain length 6 > + return val % h->size; > +} > + > +static int symcmp(hashtab_t h > + __attribute__ ((unused)), const_hashtab_key_t key1, > + const_hashtab_key_t key2) > +{ > + const struct chkdups_key *a = (const struct chkdups_key *)key1; > + const struct chkdups_key *b = (const struct chkdups_key *)key2; > + > + return strcmp(a->regex, b->regex) || (a->mode && b->mode && a->mode != b->mode); > +} > + > +static int destroy_chkdups_key(hashtab_key_t key) > +{ > + free(key); > + > + return 0; > +} > + > +/* > * Warn about duplicate specifications. > */ > static int nodups_specs(struct saved_data *data, const char *path) > { > - int rc = 0; > - unsigned int ii, jj; > + int rc = 0, ret = 0; > + unsigned int ii; > struct spec *curr_spec, *spec_arr = data->spec_arr; > + struct chkdups_key *new = NULL; > + unsigned int hashtab_len = (data->nspec / 10) ? data->nspec / 10 : 1; > > + hashtab_t hash_table = hashtab_create(symhash, symcmp, hashtab_len); v1 used a hashtable size of `data->nspec`, instead of its tenth. Since the hashtable implementation from newrole does not contain a resize feature, like the one from libsepol, the size will be fixed for its lifetime. Using `data.>nspec` gives slightly better (but probably negligible) performance: Benchmark 1: DESTDIR=~/Downloads/destdir/ ./scripts/env_use_destdir ~/Downloads/destdir/sbin/setfiles -c ../refpolicy/tmp/policy.bin ../refpolicy/tmp/all_mods.fc Time (mean ± σ): 340.4 ms ± 14.4 ms [User: 280.6 ms, System: 59.7 ms] Range (min … max): 328.1 ms … 386.0 ms 30 runs vs Benchmark 1: DESTDIR=~/Downloads/destdir/ ./scripts/env_use_destdir ~/Downloads/destdir/sbin/setfiles -c ../refpolicy/tmp/policy.bin ../refpolicy/tmp/all_mods.fc Time (mean ± σ): 334.7 ms ± 5.9 ms [User: 279.6 ms, System: 55.0 ms] Range (min … max): 327.5 ms … 362.1 ms 30 runs Since your policy contains much more file context definitions, could you run some benchmarks yourself? > + if (!hash_table) { > + rc = -1; > + COMPAT_LOG(SELINUX_ERROR, "%s: hashtab create failed.\n", path); > + return rc; > + } > for (ii = 0; ii < data->nspec; ii++) { > - curr_spec = &spec_arr[ii]; > - for (jj = ii + 1; jj < data->nspec; jj++) { > - if ((!strcmp(spec_arr[jj].regex_str, > - curr_spec->regex_str)) > - && (!spec_arr[jj].mode || !curr_spec->mode > - || spec_arr[jj].mode == curr_spec->mode)) { > - rc = -1; > - errno = EINVAL; > - if (strcmp(spec_arr[jj].lr.ctx_raw, > - curr_spec->lr.ctx_raw)) { > - COMPAT_LOG > - (SELINUX_ERROR, > - "%s: Multiple different specifications for %s (%s and %s).\n", > - path, curr_spec->regex_str, > - spec_arr[jj].lr.ctx_raw, > - curr_spec->lr.ctx_raw); > - } else { > - COMPAT_LOG > - (SELINUX_ERROR, > - "%s: Multiple same specifications for %s.\n", > - path, curr_spec->regex_str); > - } > + new = (struct chkdups_key *)malloc(sizeof(struct chkdups_key)); > + if (!new) { > + rc = -1; > + COMPAT_LOG(SELINUX_ERROR, "%s: hashtab key create failed.\n", path); > + return rc; > + } > + new->regex = spec_arr[ii].regex_str; > + new->mode = spec_arr[ii].mode; > + ret = hashtab_insert(hash_table, (hashtab_key_t)new, &spec_arr[ii]); > + if (ret == HASHTAB_SUCCESS) > + continue; > + if (ret == HASHTAB_PRESENT) { > + curr_spec = > + (struct spec *)hashtab_search(hash_table, (hashtab_key_t)new); > + rc = -1; > + errno = EINVAL; > + free(new); > + if (strcmp(spec_arr[ii].lr.ctx_raw, curr_spec->lr.ctx_raw)) { > + COMPAT_LOG > + (SELINUX_ERROR, > + "%s: Multiple different specifications for %s (%s and %s).\n", > + path, curr_spec->regex_str, > + spec_arr[ii].lr.ctx_raw, > + curr_spec->lr.ctx_raw); > + } else { > + COMPAT_LOG > + (SELINUX_ERROR, > + "%s: Multiple same specifications for %s.\n", > + path, curr_spec->regex_str); > } > } > + if (ret == HASHTAB_OVERFLOW) { > + rc = -1; > + free(new); > + COMPAT_LOG > + (SELINUX_ERROR, > + "%s: hashtab happen memory error.\n", > + path); > + break; > + } > } > + > + hashtab_destroy_key(hash_table, destroy_chkdups_key); > + > return rc; > } > > -- > 2.12.3 >