This optimization reduces the average number of comparisons required from 2*n*log2(n) - 3*n + o(n) to n*log2(n) + 0.37*n + o(n). When n is sufficiently large, it results in approximately 50% fewer comparisons. Currently, eytzinger0_sort employs the textbook version of heapsort, where during the heapify process, each level requires two comparisons to determine the maximum among three elements. In contrast, the bottom-up heapsort, during heapify, only compares two children at each level until reaching a leaf node. Then, it backtracks from the leaf node to find the correct position. Since heapify typically continues until very close to the leaf node, the standard heapify requires about 2*log2(n) comparisons, while the bottom-up variant only needs log2(n) comparisons. The experimental data presented below is based on an array generated by get_random_u32(). | N | comparisons(old) | comparisons(new) | time(old) | time(new) | |-------|------------------|------------------|-----------|-----------| | 10000 | 235381 | 136615 | 25545 us | 20366 us | | 20000 | 510694 | 293425 | 31336 us | 18312 us | | 30000 | 800384 | 457412 | 35042 us | 27386 us | | 40000 | 1101617 | 626831 | 48779 us | 38253 us | | 50000 | 1409762 | 799637 | 62238 us | 46950 us | | 60000 | 1721191 | 974521 | 75588 us | 58367 us | | 70000 | 2038536 | 1152171 | 90823 us | 68778 us | | 80000 | 2362958 | 1333472 | 104165 us | 78625 us | | 90000 | 2690900 | 1516065 | 116111 us | 89573 us | | 100000| 3019413 | 1699879 | 133638 us | 100998 us | Refs: BOTTOM-UP-HEAPSORT, a new variant of HEAPSORT beating, on an average, QUICKSORT (if n is not very small) Ingo Wegener Theoretical Computer Science, 118(1); Pages 81-98, 13 September 1993 https://doi.org/10.1016/0304-3975(93)90364-Y Signed-off-by: Kuan-Wei Chiu <visitorckw@xxxxxxxxx> --- This patch has undergone unit testing and micro benchmarking using the following code [1]. [1]: static long long int cmp_count = 0; static int mycmp(const void *a, const void *b, size_t size) { u32 _a = *(u32 *)a; u32 _b = *(u32 *)b; cmp_count++; if (_a < _b) return -1; else if (_a > _b) return 1; else return 0; } static int test(void) { size_t N, i, L, R; ktime_t start, end; s64 delta; u32 *arr; for (N = 10000; N <= 100000; N += 10000) { arr = kmalloc_array(N, sizeof(u32), GFP_KERNEL); cmp_count = 0; for (i = 0; i < N; i++) arr[i] = get_random_u32(); start = ktime_get(); eytzinger0_sort(arr, N, sizeof(u32), mycmp, NULL); end = ktime_get(); delta = ktime_us_delta(end, start); printk(KERN_INFO "time: %lld\n", delta); printk(KERN_INFO "comparisons: %lld\n", cmp_count); for (int i = 0; i < N; i++) { L = 2 * i + 1; R = 2 * i + 2; if (L < N && arr[i] < arr[L]) goto err; if (R < N && arr[i] > arr[R]) goto err; } kfree(arr); } return 0; err: kfree(arr); return -1; } fs/bcachefs/util.c | 50 +++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index c2ef7cddaa4f..bbc83b43162e 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -911,7 +911,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size, int (*cmp_func)(const void *, const void *, size_t), void (*swap_func)(void *, void *, size_t)) { - int i, c, r; + int i, j, k; if (!swap_func) { if (size == 4 && alignment_ok(base, 4)) @@ -924,17 +924,22 @@ void eytzinger0_sort(void *base, size_t n, size_t size, /* heapify */ for (i = n / 2 - 1; i >= 0; --i) { - for (r = i; r * 2 + 1 < n; r = c) { - c = r * 2 + 1; - - if (c + 1 < n && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; - - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; - - do_swap(base, n, size, swap_func, r, c); + /* Find the sift-down path all the way to the leaves. */ + for (j = i; k = j * 2 + 1, k + 1 < n;) + j = do_cmp(base, n, size, cmp_func, k, k + 1) > 0 ? k : k + 1; + + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == n) + j = j * 2 + 1; + + /* Backtrack to the correct location. */ + while (j != i && do_cmp(base, n, size, cmp_func, i, j) >= 0) + j = (j - 1) / 2; + + /* Shift the element into its correct place. */ + for (k = j; j != i;) { + j = (j - 1) / 2; + do_swap(base, n, size, swap_func, j, k); } } @@ -942,17 +947,22 @@ void eytzinger0_sort(void *base, size_t n, size_t size, for (i = n - 1; i > 0; --i) { do_swap(base, n, size, swap_func, 0, i); - for (r = 0; r * 2 + 1 < i; r = c) { - c = r * 2 + 1; + /* Find the sift-down path all the way to the leaves. */ + for (j = 0; k = j * 2 + 1, k + 1 < i;) + j = do_cmp(base, n, size, cmp_func, k, k + 1) > 0 ? k : k + 1; - if (c + 1 < i && - do_cmp(base, n, size, cmp_func, c, c + 1) < 0) - c++; + /* Special case for the last leaf with no sibling. */ + if (j * 2 + 2 == i) + j = j * 2 + 1; - if (do_cmp(base, n, size, cmp_func, r, c) >= 0) - break; + /* Backtrack to the correct location. */ + while (j && do_cmp(base, n, size, cmp_func, 0, j) >= 0) + j = (j - 1) / 2; - do_swap(base, n, size, swap_func, r, c); + /* Shift the element into its correct place. */ + for (k = j; j;) { + j = (j - 1) / 2; + do_swap(base, n, size, swap_func, j, k); } } } -- 2.25.1