Am 23.01.2017 um 20:07 schrieb Junio C Hamano: > René Scharfe <l.s.r@xxxxxx> writes: > >> Implement qsort_s() as a wrapper to the GNU version of qsort_r(1) and >> use it on Linux. Performance increases slightly: >> >> Test HEAD^ HEAD >> -------------------------------------------------------------------- >> 0071.2: sort(1) 0.10(0.20+0.02) 0.10(0.21+0.01) +0.0% >> 0071.3: string_list_sort() 0.17(0.15+0.01) 0.16(0.15+0.01) -5.9% >> >> Additionally the unstripped size of compat/qsort_s.o falls from 24576 >> to 16544 bytes in my build. >> >> IMHO these savings aren't worth the increased complexity of having to >> support two implementations. > > I do worry about having to support more implementations in the > future that have different function signature for the comparison > callbacks, which will make things ugly, but this addition alone > doesn't look too bad to me. It is unfair of me to show a 5% speedup and then recommend to not include it. ;-) That difference won't be measurable in real use cases and the patch is not necessary. This patch is simple, but the overall complexity (incl. #ifdefs etc.) will be lower without it. But here's another one, with even higher performance and with an even bigger recommendation to not include it! :) It veers off into another direction: Parallel execution. It requires thread-safe comparison functions, which might surprise callers. The value 1000 for the minimum number of items before threading kicks in is just a guess, not based on measurements. So it's quite raw -- and I'm not sure why it's still a bit slower than sort(1). Test HEAD^ HEAD --------------------------------------------------------------------- 0071.2: sort(1) 0.10(0.18+0.03) 0.10(0.20+0.02) +0.0% 0071.3: string_list_sort() 0.17(0.14+0.02) 0.11(0.18+0.02) -35.3% --- compat/qsort_s.c | 76 ++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 58 insertions(+), 18 deletions(-) diff --git a/compat/qsort_s.c b/compat/qsort_s.c index 52d1f0a73d..1304a089af 100644 --- a/compat/qsort_s.c +++ b/compat/qsort_s.c @@ -1,4 +1,5 @@ #include "../git-compat-util.h" +#include "../thread-utils.h" /* * A merge sort implementation, simplified from the qsort implementation @@ -6,29 +7,58 @@ * Added context pointer, safety checks and return value. */ -static void msort_with_tmp(void *b, size_t n, size_t s, - int (*cmp)(const void *, const void *, void *), - char *t, void *ctx) +#define MIN_ITEMS_FOR_THREAD 1000 + +struct work { + int nr_threads; + void *base; + size_t nmemb; + size_t size; + char *tmp; + int (*cmp)(const void *, const void *, void *); + void *ctx; +}; + +static void *msort_with_tmp(void *work) { + struct work one, two, *w = work; char *tmp; char *b1, *b2; size_t n1, n2; + size_t s, n; - if (n <= 1) - return; + if (w->nmemb <= 1) + return NULL; - n1 = n / 2; - n2 = n - n1; - b1 = b; - b2 = (char *)b + (n1 * s); + one = two = *w; + one.nr_threads /= 2; + two.nr_threads -= one.nr_threads; + n = one.nmemb; + s = one.size; + n1 = one.nmemb = n / 2; + n2 = two.nmemb = n - n1; + b1 = one.base; + b2 = two.base = b1 + n1 * s; + two.tmp += n1 * s; - msort_with_tmp(b1, n1, s, cmp, t, ctx); - msort_with_tmp(b2, n2, s, cmp, t, ctx); +#ifndef NO_PTHREADS + if (one.nr_threads && n > MIN_ITEMS_FOR_THREAD) { + pthread_t thread; + int err = pthread_create(&thread, NULL, msort_with_tmp, &one); + msort_with_tmp(&two); + if (err || pthread_join(thread, NULL)) + msort_with_tmp(&one); + } else +#endif + { + msort_with_tmp(&one); + msort_with_tmp(&two); + } - tmp = t; + tmp = one.tmp; while (n1 > 0 && n2 > 0) { - if (cmp(b1, b2, ctx) <= 0) { + if (one.cmp(b1, b2, one.ctx) <= 0) { memcpy(tmp, b1, s); tmp += s; b1 += s; @@ -42,7 +72,8 @@ static void msort_with_tmp(void *b, size_t n, size_t s, } if (n1 > 0) memcpy(tmp, b1, n1 * s); - memcpy(b, t, (n - n2) * s); + memcpy(one.base, one.tmp, (n - n2) * s); + return NULL; } int git_qsort_s(void *b, size_t n, size_t s, @@ -50,20 +81,29 @@ int git_qsort_s(void *b, size_t n, size_t s, { const size_t size = st_mult(n, s); char buf[1024]; + struct work w; if (!n) return 0; if (!b || !cmp) return -1; + w.nr_threads = online_cpus(); + w.base = b; + w.nmemb = n; + w.size = s; + w.cmp = cmp; + w.ctx = ctx; + if (size < sizeof(buf)) { /* The temporary array fits on the small on-stack buffer. */ - msort_with_tmp(b, n, s, cmp, buf, ctx); + w.tmp = buf; } else { /* It's somewhat large, so malloc it. */ - char *tmp = xmalloc(size); - msort_with_tmp(b, n, s, cmp, tmp, ctx); - free(tmp); + w.tmp = xmalloc(size); } + msort_with_tmp(&w); + if (w.tmp != buf) + free(w.tmp); return 0; } -- 2.11.0