Re: [PATCH 3/6] run-command: add an "ungroup" option to run_process_parallel()

Junio C Hamano <gitster@xxxxxxxxx> · Fri, 22 Apr 2022 20:54:14 -0700

Ævar Arnfjörð Bjarmason  <avarab@xxxxxxxxx> writes:

> @@ -1494,6 +1494,7 @@ struct parallel_processes {
>  	struct pollfd *pfd;
>  
>  	unsigned shutdown : 1;
> +	unsigned ungroup:1;

Match the style with the above (either with or without SP
consistently, I would choose to match existing one if I were doing
this myself).

> @@ -1537,8 +1538,9 @@ static void pp_init(struct parallel_processes *pp,
>  		    get_next_task_fn get_next_task,
>  		    start_failure_fn start_failure,
>  		    task_finished_fn task_finished,
> -		    void *data)
> +		    void *data, struct run_process_parallel_opts *opts)
>  {
> +	const int ungroup = opts->ungroup;
>  	int i;
>  
>  	if (n < 1)
> @@ -1556,16 +1558,22 @@ static void pp_init(struct parallel_processes *pp,
>  	pp->start_failure = start_failure ? start_failure : default_start_failure;
>  	pp->task_finished = task_finished ? task_finished : default_task_finished;
>  
> +	pp->ungroup = ungroup;
> +

OK, now this makes it clear that the new structure introduced in the
first step is about run_process_parallel() and not about trace2, so
it would probably make sense to go back to that step and throw these
*_fn callbacks and callback state to the structure, too.

>  	pp->nr_processes = 0;
>  	pp->output_owner = 0;
>  	pp->shutdown = 0;
>  	CALLOC_ARRAY(pp->children, n);
> -	CALLOC_ARRAY(pp->pfd, n);
> +	if (!ungroup)
> +		CALLOC_ARRAY(pp->pfd, n);

OK, we will not poll under ungroup option, so we do not need pfd[]
in that case.  It would be cleaner to clear pp->pfd = NULL if not
done already when ungroup is in effect.

> +
>  	strbuf_init(&pp->buffered_output, 0);
>  
>  	for (i = 0; i < n; i++) {
>  		strbuf_init(&pp->children[i].err, 0);
>  		child_process_init(&pp->children[i].process);
> +		if (ungroup)
> +			continue;
>  		pp->pfd[i].events = POLLIN | POLLHUP;
>  		pp->pfd[i].fd = -1;
>  	}

This does not make practical difference _right_ _now_, but as a
general code hygiene discipline, it would be more future-proof not
to rely on "ungroup" being the _only_ thing that allows us to omit
allocating pfd.  IOW, conditional allocation of pp->pfd based on
ungroup before the loop is perfectly fine, but inside the loop, it
would be better to say "if pp->pfd is not there, no matter the
reason why pp->pfd is missing, we refrain from filling the array
because we are not polling".

		if (!pp->pfd)
			continue;

We may know that the only reason we decided not to poll is with the
ungroup bit in the current code, but we do not have to depend on the
knowledge.  The only thing we need to know, in order to refrain from
setting POLLIN/POLLHUP bits, is that we decided that we will not poll,
and the decision should be more directly found in pp->pfd than inferring
what the ungroup says.

> @@ -1576,6 +1584,7 @@ static void pp_init(struct parallel_processes *pp,
>  
>  static void pp_cleanup(struct parallel_processes *pp)
>  {
> +	const int ungroup = pp->ungroup;
>  	int i;
>  
>  	trace_printf("run_processes_parallel: done");
> @@ -1585,14 +1594,17 @@ static void pp_cleanup(struct parallel_processes *pp)
>  	}
>  
>  	free(pp->children);
> -	free(pp->pfd);
> +	if (!ungroup)
> +		free(pp->pfd);

Likewise, the NULLness of pp->pfd should be what matters.

>  	/*
>  	 * When get_next_task added messages to the buffer in its last
>  	 * iteration, the buffered output is non empty.
>  	 */
> -	strbuf_write(&pp->buffered_output, stderr);
> -	strbuf_release(&pp->buffered_output);
> +	if (!ungroup) {
> +		strbuf_write(&pp->buffered_output, stderr);
> +		strbuf_release(&pp->buffered_output);
> +	}

OK, this need to happen only when we are buffering.

>  	sigchain_pop_common();
>  }
> @@ -1606,6 +1618,7 @@ static void pp_cleanup(struct parallel_processes *pp)
>   */
>  static int pp_start_one(struct parallel_processes *pp)
>  {
> +	const int ungroup = pp->ungroup;
>  	int i, code;
>  
>  	for (i = 0; i < pp->max_processes; i++)
> @@ -1615,24 +1628,31 @@ static int pp_start_one(struct parallel_processes *pp)
>  		BUG("bookkeeping is hard");
>  
>  	code = pp->get_next_task(&pp->children[i].process,
> -				 &pp->children[i].err,
> +				 ungroup ? NULL : &pp->children[i].err,

OK.

>  				 pp->data,
>  				 &pp->children[i].data);
>  	if (!code) {
> -		strbuf_addbuf(&pp->buffered_output, &pp->children[i].err);
> -		strbuf_reset(&pp->children[i].err);
> +		if (!ungroup) {
> +			strbuf_addbuf(&pp->buffered_output, &pp->children[i].err);
> +			strbuf_reset(&pp->children[i].err);
> +		}
>  		return 1;
>  	}
> -	pp->children[i].process.err = -1;
> -	pp->children[i].process.stdout_to_stderr = 1;
> -	pp->children[i].process.no_stdin = 1;
> +
> +	if (!ungroup) {
> +		pp->children[i].process.err = -1;
> +		pp->children[i].process.stdout_to_stderr = 1;
> +		pp->children[i].process.no_stdin = 1;
> +	}

OK, except for .no_stdin bit.  Even before the "we started using the
parallel running API to drive hooks, losing the direct access to the
real standard output from the hooks" regression, we didn't expose the
input side to the hooks.  run_hook_ve() did set .no_stdin and I do
not think we want to change that with "--ungroup".  If there is any
reason why you needed not to keep .no_stdin bit set in the ungroup
mode, deviating from what the code before the regression did, that
needs to be explained (but I suspect this was simply a bug in this
round of the patch, not a deliberate behaviour change).

>  	if (start_command(&pp->children[i].process)) {
> -		code = pp->start_failure(&pp->children[i].err,
> +		code = pp->start_failure(ungroup ? NULL : &pp->children[i].err,
>  					 pp->data,
>  					 pp->children[i].data);
> -		strbuf_addbuf(&pp->buffered_output, &pp->children[i].err);
> -		strbuf_reset(&pp->children[i].err);
> +		if (!ungroup) {
> +			strbuf_addbuf(&pp->buffered_output, &pp->children[i].err);
> +			strbuf_reset(&pp->children[i].err);
> +		}
>  		if (code)
>  			pp->shutdown = 1;
>  		return code;
> @@ -1640,14 +1660,26 @@ static int pp_start_one(struct parallel_processes *pp)
>  
>  	pp->nr_processes++;
>  	pp->children[i].state = GIT_CP_WORKING;
> -	pp->pfd[i].fd = pp->children[i].process.err;
> +	if (!ungroup)
> +		pp->pfd[i].fd = pp->children[i].process.err;
>  	return 0;
>  }
>  
> +static void pp_mark_working_for_cleanup(struct parallel_processes *pp)
> +{
> +	int i;
> +
> +	for (i = 0; i < pp->max_processes; i++)
> +		if (pp->children[i].state == GIT_CP_WORKING)
> +			pp->children[i].state = GIT_CP_WAIT_CLEANUP;
> +}
> +
>  static void pp_buffer_stderr(struct parallel_processes *pp, int output_timeout)
>  {
>  	int i;
>  
> +	assert(!pp->ungroup);
> +
>  	while ((i = poll(pp->pfd, pp->max_processes, output_timeout)) < 0) {
>  		if (errno == EINTR)
>  			continue;
> @@ -1674,6 +1706,9 @@ static void pp_buffer_stderr(struct parallel_processes *pp, int output_timeout)
>  static void pp_output(struct parallel_processes *pp)
>  {
>  	int i = pp->output_owner;
> +
> +	assert(!pp->ungroup);
> +
>  	if (pp->children[i].state == GIT_CP_WORKING &&
>  	    pp->children[i].err.len) {
>  		strbuf_write(&pp->children[i].err, stderr);
> @@ -1683,10 +1718,15 @@ static void pp_output(struct parallel_processes *pp)
>  
>  static int pp_collect_finished(struct parallel_processes *pp)
>  {
> +	const int ungroup = pp->ungroup;
>  	int i, code;
>  	int n = pp->max_processes;
>  	int result = 0;
>  
> +	if (ungroup)
> +		for (i = 0; i < pp->max_processes; i++)
> +			pp->children[i].state = GIT_CP_WAIT_CLEANUP;
> +
>  	while (pp->nr_processes > 0) {
>  		for (i = 0; i < pp->max_processes; i++)
>  			if (pp->children[i].state == GIT_CP_WAIT_CLEANUP)
> @@ -1697,8 +1737,8 @@ static int pp_collect_finished(struct parallel_processes *pp)
>  		code = finish_command(&pp->children[i].process);
>  
>  		code = pp->task_finished(code,
> -					 &pp->children[i].err, pp->data,
> -					 pp->children[i].data);
> +					 ungroup ? NULL : &pp->children[i].err,
> +					 pp->data, pp->children[i].data);
>  
>  		if (code)
>  			result = code;
> @@ -1707,10 +1747,13 @@ static int pp_collect_finished(struct parallel_processes *pp)
>  
>  		pp->nr_processes--;
>  		pp->children[i].state = GIT_CP_FREE;
> -		pp->pfd[i].fd = -1;
> +		if (!ungroup)
> +			pp->pfd[i].fd = -1;
>  		child_process_init(&pp->children[i].process);
>  
> -		if (i != pp->output_owner) {
> +		if (ungroup) {
> +			/* no strbuf_*() work to do here */

Make it a habit to keep ";" semicolon when writing an empty
statement, i.e.

			; /* some comments */

This will help when other else/if body becomes shorter and we can
lose the {} around here.

I cannot quite shake the feeling that this step is doing so much
only because it wants to coax "run in parallel" infrastructure to do
what it is not suited to do, i.e. drive hooks that wants to directly
face the end-user output channel, and it might make a conceptually
cleaner fix to simply revert the root cause, but it is getting late
so...