[RFC PATCH v2 0/5] range-diff: fix segfault due to integer overflow

Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx> · Fri, 10 Dec 2021 13:30:37 +0100

Now a much smaller and hopefully more sensible fix for an overflow in
range-diff, thanks a lot to Jeff King for comments on the previous
(bad) direction in v1.

Ævar Arnfjörð Bjarmason (5):
  range-diff: zero out elements in "cost" first
  linear-assignment.c: split up compute_assignment() function
  linear-assignment.c: take "size_t", not "int" for *_count
  range-diff.c: rename "n" to "column_count" in get_correspondences()
  range-diff: fix integer overflow & segfault on cost[i + n * j]

 linear-assignment.c | 110 +++++++++++++++++++++++++++++++-------------
 linear-assignment.h |  20 +++++++-
 range-diff.c        |  25 +++++-----
 3 files changed, 107 insertions(+), 48 deletions(-)

Range-diff against v1:
 1:  7c929096381 <  -:  ----------- string-list API: change "nr" and "alloc" to "size_t"
 2:  bd7d014c531 <  -:  ----------- range-diff.c: don't use st_mult() for signed "int"
 3:  183418f1223 <  -:  ----------- range-diff.c: use "size_t" to refer to "struct string_list"'s "nr"
 4:  fe9dcb2d453 !  1:  068c203adc6 range-diff: zero out elements in "cost" first
    @@ linear-assignment.c: void compute_assignment(int column_count, int row_count, in
      ## range-diff.c ##
     @@ range-diff.c: static void get_correspondences(struct string_list *a, struct string_list *b,
      	int *cost, c, *a2b, *b2a;
    - 	size_t i, j;
    + 	int i, j;

     -	ALLOC_ARRAY(cost, st_mult(n, n));
     -	ALLOC_ARRAY(a2b, n);
 5:  0e1e2d107cd =  2:  2233872545e linear-assignment.c: split up compute_assignment() function
 6:  9b697720e00 =  3:  580b76c0759 linear-assignment.c: take "size_t", not "int" for *_count
10:  46395080b64 !  4:  f8bbe1954fc linear-assignment.c: use "intmax_t" instead of "int"
    @@ Metadata
     Author: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>

      ## Commit message ##
    -    linear-assignment.c: use "intmax_t" instead of "int"
    +    range-diff.c: rename "n" to "column_count" in get_correspondences()

    -    Change the "int" type used by compute_assignment() to "intmax_t". On
    -    64 bit systems this changes the overflow "die" added in the preceding
    -    commit (which before that was a segfault) to something that merely
    -    takes a very long time and a lot of memory to run.
    -
    -    On my relatively beefy system this completes:
    -
    -        git -P range-diff --creation-factor=50 origin/master...git-for-windows/main
    -
    -    In around 300 seconds, with a reported max RSS of just under 18GB, but
    -    it does give you correct results for all ~50k commitsin that range.
    +    In preparation for using the COST macro in linear-assignment.c rename
    +    the "n" variable, it assumes that the "n" in "a + n * b" is named
    +    "column_count".

         Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>

    - ## linear-assignment.c ##
    -@@
    - #include "linear-assignment.h"
    - #include "compat/gnulib/intprops.h"
    - 
    --static inline int cost_index(int *cost, int a, int b, int c)
    -+static inline intmax_t cost_index(intmax_t *cost, intmax_t a, intmax_t b, intmax_t c)
    - {
    --	int r;
    -+	intmax_t r;
    - 
    - 	if (INT_MULTIPLY_WRAPV(a, c, &r))
    --		die(_("integer overflow in cost[%d + %d * %d] multiplication"), b, a, c);
    -+		die(_("integer overflow in cost[%"PRIuMAX" + %"PRIuMAX" * %"PRIuMAX"] multiplication"), b, a, c);
    - 	if (INT_ADD_WRAPV(b, r, &r))
    --		die(_("integer overflow in cost[%d + ((%d * %d) = %d)] addition"), b, a, c, r);
    -+		die(_("integer overflow in cost[%"PRIuMAX" + ((%"PRIuMAX" * %"PRIuMAX") = %"PRIuMAX")] addition"), b, a, c, r);
    - 
    - 	return r;
    - }
    -@@ linear-assignment.c: static inline int cost_index(int *cost, int a, int b, int c)
    - #define COST(column, row) cost[cost_index(cost, column_count, column, row)]
    - 
    - static void columns_reduction(size_t column_count, size_t row_count,
    --			      int *cost,
    --			      int *column2row, int *row2column,
    --			      int *v)
    -+			      intmax_t *cost,
    -+			      intmax_t *column2row, intmax_t *row2column,
    -+			      intmax_t *v)
    - {
    --	int i, j;
    -+	intmax_t i, j;
    - 
    - 	/* column reduction */
    - 	for (j = column_count - 1; j >= 0; j--) {
    --		int i1 = 0;
    -+		intmax_t i1 = 0;
    - 
    - 		for (i = 1; i < row_count; i++)
    - 			if (COST(j, i1) > COST(j, i))
    -@@ linear-assignment.c: static void columns_reduction(size_t column_count, size_t row_count,
    - }
    - 
    - static void reduction_transfer(size_t column_count, size_t row_count,
    --			       int *cost,
    --			       int *free_row, int *free_count,
    --			       int *column2row, int *row2column,
    --			       int *v)
    -+			       intmax_t *cost,
    -+			       intmax_t *free_row, intmax_t *free_count,
    -+			       intmax_t *column2row, intmax_t *row2column,
    -+			       intmax_t *v)
    - {
    --	int i, j;
    -+	intmax_t i, j;
    - 
    - 	/* reduction transfer */
    - 	for (i = 0; i < row_count; i++) {
    --		int j1 = row2column[i];
    -+		intmax_t j1 = row2column[i];
    - 		if (j1 == -1)
    - 			free_row[(*free_count)++] = i;
    - 		else if (j1 < -1)
    - 			row2column[i] = -2 - j1;
    - 		else {
    --			int min = COST(!j1, i) - v[!j1];
    -+			intmax_t min = COST(!j1, i) - v[!j1];
    - 			for (j = 1; j < column_count; j++)
    - 				if (j != j1 && min > COST(j, i) - v[j])
    - 					min = COST(j, i) - v[j];
    -@@ linear-assignment.c: static void reduction_transfer(size_t column_count, size_t row_count,
    - }
    - 
    - static void augmenting_row_reduction(size_t column_count,
    --				     int *cost,
    --				     int *column2row, int *row2column,
    --				     int *free_row, int *free_count, int *saved_free_count,
    --				     int *v)
    -+				     intmax_t *cost,
    -+				     intmax_t *column2row, intmax_t *row2column,
    -+				     intmax_t *free_row, intmax_t *free_count, intmax_t *saved_free_count,
    -+				     intmax_t *v)
    - {
    - 	int phase;
    - 
    - 	/* augmenting row reduction */
    - 	for (phase = 0; phase < 2; phase++) {
    --		int i;
    --		int k = 0;
    -+		intmax_t i;
    -+		intmax_t k = 0;
    - 
    - 		*saved_free_count = *free_count;
    - 		*free_count = 0;
    - 		while (k < *saved_free_count) {
    --			int j;
    --			int u1, u2;
    --			int j1 = 0, j2, i0;
    -+			intmax_t j;
    -+			intmax_t u1, u2;
    -+			intmax_t j1 = 0, j2, i0;
    - 
    - 			i = free_row[k++];
    - 			u1 = COST(j1, i) - v[j1];
    - 			j2 = -1;
    --			u2 = INT_MAX;
    -+			u2 = INTMAX_MAX;
    - 			for (j = 1; j < column_count; j++) {
    --				int c = COST(j, i) - v[j];
    -+				intmax_t c = COST(j, i) - v[j];
    - 				if (u2 > c) {
    - 					if (u1 < c) {
    - 						u2 = c;
    -@@ linear-assignment.c: static void augmenting_row_reduction(size_t column_count,
    - }
    - 
    - static void augmentation(size_t column_count,
    --			 int *cost,
    --			 int *column2row, int *row2column,
    --			 int *free_row, int free_count,
    --			 int *v)
    -+			 intmax_t *cost,
    -+			 intmax_t *column2row, intmax_t *row2column,
    -+			 intmax_t *free_row, intmax_t free_count,
    -+			 intmax_t *v)
    - {
    --	int i, j;
    --	int *d;
    --	int *pred, *col;
    --	int saved_free_count;
    -+	intmax_t i, j;
    -+	intmax_t *d;
    -+	intmax_t *pred, *col;
    -+	intmax_t saved_free_count;
    - 
    - 	/* augmentation */
    - 	saved_free_count = free_count;
    -@@ linear-assignment.c: static void augmentation(size_t column_count,
    - 	ALLOC_ARRAY(pred, column_count);
    - 	ALLOC_ARRAY(col, column_count);
    - 	for (free_count = 0; free_count < saved_free_count; free_count++) {
    --		int i1 = free_row[free_count], low = 0, up = 0, last, k;
    --		int min, c, u1;
    -+		intmax_t i1 = free_row[free_count], low = 0, up = 0, last, k;
    -+		intmax_t min, c, u1;
    - 
    - 		for (j = 0; j < column_count; j++) {
    - 			d[j] = COST(j, i1) - v[j];
    -@@ linear-assignment.c: static void augmentation(size_t column_count,
    - 
    - 			/* scan a row */
    - 			do {
    --				int j1 = col[low++];
    -+				intmax_t j1 = col[low++];
    - 
    - 				i = column2row[j1];
    - 				u1 = COST(j1, i) - v[j1] - min;
    -@@ linear-assignment.c: static void augmentation(size_t column_count,
    - update:
    - 		/* updating of the column pieces */
    - 		for (k = 0; k < last; k++) {
    --			int j1 = col[k];
    -+			intmax_t j1 = col[k];
    - 			v[j1] += d[j1] - min;
    - 		}
    - 
    - 		/* augmentation */
    - 		do {
    - 			if (j < 0)
    --				BUG("negative j: %d", j);
    -+				BUG("negative j: %"PRIuMAX, j);
    - 			i = pred[j];
    - 			column2row[j] = i;
    - 			SWAP(j, row2column[i]);
    -@@ linear-assignment.c: static void augmentation(size_t column_count,
    -  * i is `cost[j + column_count * i].
    -  */
    - void compute_assignment(size_t column_count, size_t row_count,
    --			int *cost,
    --			int *column2row, int *row2column)
    -+			intmax_t *cost,
    -+			intmax_t *column2row, intmax_t *row2column)
    - {
    --	int *v;
    --	int *free_row, free_count = 0, saved_free_count;
    -+	intmax_t *v;
    -+	intmax_t *free_row, free_count = 0, saved_free_count;
    - 
    - 	assert(column_count > 1);
    --	memset(column2row, -1, sizeof(int) * column_count);
    --	memset(row2column, -1, sizeof(int) * row_count);
    -+	memset(column2row, -1, sizeof(intmax_t) * column_count);
    -+	memset(row2column, -1, sizeof(intmax_t) * row_count);
    - 	ALLOC_ARRAY(v, column_count);
    - 
    - 	columns_reduction(column_count, row_count, cost, column2row,
    -
    - ## linear-assignment.h ##
    -@@
    -  * row_count).
    -  */
    - void compute_assignment(size_t column_count, size_t row_count,
    --			int *cost,
    --			int *column2row, int *row2column);
    --
    --/* The maximal cost in the cost matrix (to prevent integer overflows). */
    --#define COST_MAX (1<<16)
    --
    -+			intmax_t *cost,
    -+			intmax_t *column2row, intmax_t *row2column);
    - #endif
    -
      ## range-diff.c ##
     @@ range-diff.c: static int diffsize(const char *a, const char *b)
    - 		return count;
    - 
    - 	error(_("failed to generate diff"));
    --	return COST_MAX;
    -+	return INT_MAX;
    - }
    - 
      static void get_correspondences(struct string_list *a, struct string_list *b,
      				int creation_factor)
      {
    - 	size_t n = st_add(a->nr, b->nr);
    --	int *cost, c, *a2b, *b2a;
    -+	intmax_t *cost, c, *a2b, *b2a;
    - 	size_t i, j;
    - 
    - 	CALLOC_ARRAY(cost, st_mult(n, n));
    - 	CALLOC_ARRAY(a2b, n);
    - 	CALLOC_ARRAY(b2a, n);
    +-	int n = a->nr + b->nr;
    ++	int column_count = st_add(a->nr, b->nr);
    + 	int *cost, c, *a2b, *b2a;
    + 	int i, j;
    + 
    +-	CALLOC_ARRAY(cost, st_mult(n, n));
    +-	CALLOC_ARRAY(a2b, n);
    +-	CALLOC_ARRAY(b2a, n);
    ++	CALLOC_ARRAY(cost, st_mult(column_count, column_count));
    ++	CALLOC_ARRAY(a2b, column_count);
    ++	CALLOC_ARRAY(b2a, column_count);

    -+
      	for (i = 0; i < a->nr; i++) {
      		struct patch_util *a_util = a->items[i].util;
    - 
     @@ range-diff.c: static void get_correspondences(struct string_list *a, struct string_list *b,
    - 			else if (a_util->matching < 0 && b_util->matching < 0)
      				c = diffsize(a_util->diff, b_util->diff);
      			else
    --				c = COST_MAX;
    -+				c = INT_MAX;
    - 			cost[i + n * j] = c;
    + 				c = COST_MAX;
    +-			cost[i + n * j] = c;
    ++			cost[i + column_count * j] = c;
      		}

      		c = a_util->matching < 0 ?
    --			a_util->diffsize * creation_factor / 100 : COST_MAX;
    -+			a_util->diffsize * creation_factor / 100 : INT_MAX;
    - 		for (j = b->nr; j < n; j++)
    - 			cost[i + n * j] = c;
    + 			a_util->diffsize * creation_factor / 100 : COST_MAX;
    +-		for (j = b->nr; j < n; j++)
    +-			cost[i + n * j] = c;
    ++		for (j = b->nr; j < column_count; j++)
    ++			cost[i + column_count * j] = c;
      	}
    + 
    + 	for (j = 0; j < b->nr; j++) {
     @@ range-diff.c: static void get_correspondences(struct string_list *a, struct string_list *b,
    - 		struct patch_util *util = b->items[j].util;

      		c = util->matching < 0 ?
    --			util->diffsize * creation_factor / 100 : COST_MAX;
    -+			util->diffsize * creation_factor / 100 : INT_MAX;
    - 		for (i = a->nr; i < n; i++)
    - 			cost[i + n * j] = c;
    + 			util->diffsize * creation_factor / 100 : COST_MAX;
    +-		for (i = a->nr; i < n; i++)
    +-			cost[i + n * j] = c;
    ++		for (i = a->nr; i < column_count; i++)
    ++			cost[i + column_count * j] = c;
      	}
    + 
    +-	if (n > 1)
    +-		compute_assignment(n, n, cost, a2b, b2a);
    ++	if (column_count > 1)
    ++		compute_assignment(column_count, column_count, cost, a2b, b2a);
    + 
    + 	for (i = 0; i < a->nr; i++)
    + 		if (a2b[i] >= 0 && a2b[i] < b->nr) {
 7:  a82771413f7 !  5:  9194965635a linear-assignment.c: convert a macro to a "static inline" function
    @@ Metadata
     Author: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>

      ## Commit message ##
    -    linear-assignment.c: convert a macro to a "static inline" function
    +    range-diff: fix integer overflow & segfault on cost[i + n * j]

    -    Change the COST() macro to be a "static inline" function. On GCC this
    -    makes no difference in performance, but this improves the readability
    -    of the function. In a subsequent commit we'll make use of this to
    -    extend this function with overflow detection.
    +    in preceding commits the "column_count" and the "int*"'s we malloc()
    +    were changed to track their length with a size_t, so we're able to
    +    track as many "cost" items as malloc() will give us.
    +
    +    But we'd still segfault on relatively large range comparisons,
    +    e.g. this would segfault:
    +
    +        git -P range-diff --creation-factor=50 origin/master...git-for-windows/main
    +
    +    The reason for that is that we'd still use integer types to compute an
    +    array index into the "cost" array, which would overflow. The result of
    +    a signed overflow in C is undefined, but on my system it'll result in
    +    a negative number, and a prompt segfault as we'll try to access a
    +    negative array index.
    +
    +    Luckily we used the COST() macro in linear-assignment.c already for
    +    all of these lookups, and in a preceding commit we renamed "n" in
    +    "range-diff.c"'s get_correspondences() to "column_count" in
    +    preparation for using it here.
    +
    +    So let's use it for the three occurrences of "cost" indexing in
    +    range-diff.c, and have the COST() macro itself do overflow checking
    +    with st_mult() and st_add(). Due to the cast to "size_t" from "int"
    +    we'll avoid the segfault, and will end up correctly pointing to the
    +    relevant "int *".
    +
    +    It's not important that we use the new cost_offset() inline function
    +    here, we could also use the st_*() macros inline. By using it we'll
    +    get a more meaningful backtrace in a debugger to the relevant
    +    addition/multiplication line if we end up calling die() here.
    +
    +    It's still possible for us to overflow even with this change, that's
    +    because the iteration variables (such as "i" and "j" in this diff
    +    context are all "int"), even if we changed those to "size_t" or
    +    "intmax_t" (not trivial, as we depend on them being negative in some
    +    places) the underlying "struct string_list"'s "nr" member is an
    +    "unsigned int", which would eventually overflow.
    +
    +    However the danger of that overflow isn't as great, as we were
    +    overflowing on "i + column_count * j" before this change, it'll
    +    require a much bigger range for us to have an integer overflow on the
    +    number of commits we're processing.
    +
    +    We're unlikely to encounter a 2-4 billion commit history on 32 bit
    +    platforms. Even if we did one of the types in the underlying object
    +    machinery would probably overflow before we overflowed here. So let's
    +    punt that for now. If we're ever going to solve that issue [1] to
    +    change the "struct string_list"'s "nr" member to a "size_t" might be a
    +    good start.
    +
    +    1. https://lore.kernel.org/git/RFC-patch-01.10-7c929096381-20211209T191653Z-avarab@xxxxxxxxx/

         Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@xxxxxxxxx>

    @@ linear-assignment.c
      #include "linear-assignment.h"

     -#define COST(column, row) cost[(column) + column_count * (row)]
    -+static inline int cost_index(int *cost, int a, int b, int c)
    +-
    + static void columns_reduction(size_t column_count, size_t row_count,
    + 			      int *cost,
    + 			      int *column2row, int *row2column,
    +
    + ## linear-assignment.h ##
    +@@ linear-assignment.h: void compute_assignment(size_t column_count, size_t row_count,
    + 			int *cost,
    + 			int *column2row, int *row2column);
    + 
    ++/**
    ++ * Get an overflow-proof offset into the "cost" array.
    ++ */
    ++static inline size_t cost_offset(const size_t column,
    ++				 const size_t column_count, const size_t row)
     +{
    -+	int r;
    ++	const size_t a = st_mult(column_count, row);
    ++	const size_t b = st_add(column, a);
     +
    -+	r = b + a * c;
    -+
    -+	return r;
    ++	return b;
     +}
     +
    -+#define COST(column, row) cost[cost_index(cost, column_count, column, row)]
    ++/**
    ++ * Convenience macro for doing the cost[] lookup using cost_offset().
    ++ */
    ++#define COST(column, row) cost[cost_offset((column), (column_count), (row))]
    ++
    + /* The maximal cost in the cost matrix (to prevent integer overflows). */
    + #define COST_MAX (1<<16)

    - static void columns_reduction(size_t column_count, size_t row_count,
    - 			      int *cost,
    +
    + ## range-diff.c ##
    +@@ range-diff.c: static void get_correspondences(struct string_list *a, struct string_list *b,
    + 				c = diffsize(a_util->diff, b_util->diff);
    + 			else
    + 				c = COST_MAX;
    +-			cost[i + column_count * j] = c;
    ++			COST(i, j) = c;
    + 		}
    + 
    + 		c = a_util->matching < 0 ?
    + 			a_util->diffsize * creation_factor / 100 : COST_MAX;
    + 		for (j = b->nr; j < column_count; j++)
    +-			cost[i + column_count * j] = c;
    ++			COST(i, j) = c;
    + 	}
    + 
    + 	for (j = 0; j < b->nr; j++) {
    +@@ range-diff.c: static void get_correspondences(struct string_list *a, struct string_list *b,
    + 		c = util->matching < 0 ?
    + 			util->diffsize * creation_factor / 100 : COST_MAX;
    + 		for (i = a->nr; i < column_count; i++)
    +-			cost[i + column_count * j] = c;
    ++			COST(i, j) = c;
    + 	}
    + 
    + 	if (column_count > 1)
 8:  794d494bedd <  -:  ----------- linear-assignment.c: detect signed add/mul on GCC and Clang
 9:  2026b4bff90 <  -:  ----------- linear-assignment.c: add and use intprops.h from Gnulib
-- 
2.34.1.932.g36842105b61