[] One general question up here, more comments inline. The current order for a clean-filter is like this, I removed the error handling: int convert_to_git() { ret |= apply_filter(path, src, len, -1, dst, filter); if (ret && dst) { src = dst->buf; len = dst->len; } ret |= crlf_to_git(path, src, len, dst, ca.crlf_action, checksafe); return ret | ident_to_git(path, src, len, dst, ca.ident); } The first step is the clean filter, the CRLF-LF conversion (if needed), then ident. The current implementation streams the whole file content to the filter, (STDIN of the filter) and reads back STDOUT from the filter into a STRBUF. This is to use the UNIX-like STDIN--STDOUT method for writing a filter. However, int would_convert_to_git_filter_fd() and convert_to_git_filter_fd() offer a sort of short-cut: The filter reads from the file directly, and the output of the filter is read into a STRBUF. It looks as if the multi-filter approach can use this in a similar way: Give the pathname to the filter, the filter opens the file for reading and stream the result via the pkt-line protocol into Git. This needs some more changes, and may be very well go into a separate patch series. (and should). What I am asking for: When a multi-filter is used, the content is handled to the filter via pkt-line, and the result is given to Git via pkt-line ? Nothing wrong with it, I just wonder, if it should be mentioned somewhere. > diff --git a/contrib/long-running-filter/example.pl b/contrib/long-running-filter/example.pl > new file mode 100755 > index 0000000..279fbfb > --- /dev/null > +++ b/contrib/long-running-filter/example.pl > @@ -0,0 +1,111 @@ > +#!/usr/bin/perl > +# > +# Example implementation for the Git filter protocol version 2 > +# See Documentation/gitattributes.txt, section "Filter Protocol" > +# > + > +use strict; > +use warnings; > + > +my $MAX_PACKET_CONTENT_SIZE = 65516; > + > +sub packet_read { > + my $buffer; > + my $bytes_read = read STDIN, $buffer, 4; > + if ( $bytes_read == 0 ) { > + > + # EOF - Git stopped talking to us! > + exit(); > + } > + elsif ( $bytes_read != 4 ) { > + die "invalid packet size '$bytes_read' field"; > + } This is half-kosher, I would say, (And I really. really would like to see an implementation in C ;-) A read function may look like this: ret = read(0, &buffer, 4); if (ret < 0) { /* Error, nothing we can do */ exit(1); } else if (ret == 0) { /* EOF */ exit(0); } else if (ret < 4) { /* * Go and read more, until we have 4 bytes or EOF or Error */ } else { /* Good case, see below */ } > + my $pkt_size = hex($buffer); > + if ( $pkt_size == 0 ) { > + return ( 1, "" ); > + } > + elsif ( $pkt_size > 4 ) { > + my $content_size = $pkt_size - 4; > + $bytes_read = read STDIN, $buffer, $content_size; > + if ( $bytes_read != $content_size ) { > + die "invalid packet ($content_size expected; $bytes_read read)"; > + } > + return ( 0, $buffer ); > + } > + else { > + die "invalid packet size"; > + } > +} > + > +sub packet_write { > + my ($packet) = @_; > + print STDOUT sprintf( "%04x", length($packet) + 4 ); > + print STDOUT $packet; > + STDOUT->flush(); > +} > + > +sub packet_flush { > + print STDOUT sprintf( "%04x", 0 ); > + STDOUT->flush(); > +} > + > +( packet_read() eq ( 0, "git-filter-client" ) ) || die "bad initialization"; > +( packet_read() eq ( 0, "version=2" ) ) || die "bad version"; > +( packet_read() eq ( 1, "" ) ) || die "bad version end"; > + > +packet_write("git-filter-server\n"); > +packet_write("version=2\n"); > + > +( packet_read() eq ( 0, "clean=true" ) ) || die "bad capability"; > +( packet_read() eq ( 0, "smudge=true" ) ) || die "bad capability"; > +( packet_read() eq ( 1, "" ) ) || die "bad capability end"; > + > +packet_write( "clean=true\n" ); > +packet_write( "smudge=true\n" ); > +packet_flush(); > + > +while (1) { > + my ($command) = packet_read() =~ /^command=([^=]+)\n$/; > + my ($pathname) = packet_read() =~ /^pathname=([^=]+)\n$/; > + > + packet_read(); > + > + my $input = ""; > + { > + binmode(STDIN); > + my $buffer; > + my $done = 0; > + while ( !$done ) { > + ( $done, $buffer ) = packet_read(); > + $input .= $buffer; > + } > + } > + > + my $output; > + if ( $command eq "clean" ) { > + ### Perform clean here ### > + $output = $input; > + } > + elsif ( $command eq "smudge" ) { > + ### Perform smudge here ### > + $output = $input; > + } > + else { > + die "bad command '$command'"; > + } > + > + packet_write("status=success\n"); > + packet_flush(); > + while ( length($output) > 0 ) { > + my $packet = substr( $output, 0, $MAX_PACKET_CONTENT_SIZE ); > + packet_write($packet); > + if ( length($output) > $MAX_PACKET_CONTENT_SIZE ) { > + $output = substr( $output, $MAX_PACKET_CONTENT_SIZE ); > + } > + else { > + $output = ""; > + } > + } > + packet_flush(); # flush content! > + packet_flush(); # empty list! > +} > diff --git a/convert.c b/convert.c > index a068df7..0ed48ed 100644 > --- a/convert.c > +++ b/convert.c > @@ -3,6 +3,7 @@ > #include "run-command.h" > #include "quote.h" > #include "sigchain.h" > +#include "pkt-line.h" > > /* > * convert.c - convert a file when checking it out and checking it in. > @@ -442,7 +443,7 @@ static int filter_buffer_or_fd(int in, int out, void *data) > return (write_err || status); > } > > -static int apply_filter(const char *path, const char *src, size_t len, int fd, > +static int apply_single_file_filter(const char *path, const char *src, size_t len, int fd, > struct strbuf *dst, const char *cmd) > { > /* > @@ -456,12 +457,6 @@ static int apply_filter(const char *path, const char *src, size_t len, int fd, > struct async async; > struct filter_params params; > > - if (!cmd || !*cmd) > - return 0; > - > - if (!dst) > - return 1; > - > memset(&async, 0, sizeof(async)); > async.proc = filter_buffer_or_fd; > async.data = ¶ms; > @@ -496,14 +491,318 @@ static int apply_filter(const char *path, const char *src, size_t len, int fd, > return !err; > } > > +#define CAP_CLEAN (1u<<0) > +#define CAP_SMUDGE (1u<<1) Is CAP_ too generic, and GIT_FILTER_CAP (or so) less calling for trouble ? > + > +struct cmd2process { > + struct hashmap_entry ent; /* must be the first member! */ > + int supported_capabilities; > + const char *cmd; > + struct child_process process; > +}; > + > +static int cmd_process_map_initialized; > +static struct hashmap cmd_process_map; > + > +static int cmd2process_cmp(const struct cmd2process *e1, > + const struct cmd2process *e2, > + const void *unused) > +{ > + return strcmp(e1->cmd, e2->cmd); > +} > + > +static struct cmd2process *find_multi_file_filter_entry(struct hashmap *hashmap, const char *cmd) > +{ > + struct cmd2process key; > + hashmap_entry_init(&key, strhash(cmd)); > + key.cmd = cmd; > + return hashmap_get(hashmap, &key, NULL); > +} > + > +static void kill_multi_file_filter(struct hashmap *hashmap, struct cmd2process *entry) > +{ > + if (!entry) > + return; > + sigchain_push(SIGPIPE, SIG_IGN); > + /* > + * We kill the filter most likely because an error happened already. > + * That's why we are not interested in any error code here. > + */ > + close(entry->process.in); > + close(entry->process.out); > + sigchain_pop(SIGPIPE); > + finish_command(&entry->process); > + hashmap_remove(hashmap, entry, NULL); > + free(entry); > +} > + > +static int packet_write_list(int fd, const char *line, ...) > +{ > + va_list args; > + int err; > + va_start(args, line); > + for (;;) > + { > + if (!line) > + break; > + if (strlen(line) > PKTLINE_DATA_MAXLEN) > + return -1; > + err = packet_write_fmt_gently(fd, "%s", line); > + if (err) > + return err; > + line = va_arg(args, const char*); > + } > + va_end(args); > + return packet_flush_gently(fd); > +} > + > +static struct cmd2process *start_multi_file_filter(struct hashmap *hashmap, const char *cmd) > +{ > + int err; > + struct cmd2process *entry; > + struct child_process *process; > + const char *argv[] = { cmd, NULL }; > + struct string_list cap_list = STRING_LIST_INIT_NODUP; > + char *cap_buf; > + const char *cap_name; > + > + entry = xmalloc(sizeof(*entry)); > + hashmap_entry_init(entry, strhash(cmd)); > + entry->cmd = cmd; > + entry->supported_capabilities = 0; > + process = &entry->process; > + > + child_process_init(process); > + process->argv = argv; > + process->use_shell = 1; > + process->in = -1; > + process->out = -1; > + > + if (start_command(process)) { > + error("cannot fork to run external filter '%s'", cmd); > + kill_multi_file_filter(hashmap, entry); > + return NULL; > + } > + > + sigchain_push(SIGPIPE, SIG_IGN); > + > + err = packet_write_list(process->in, "git-filter-client", "version=2", NULL); > + if (err) > + goto done; > + > + err = strcmp(packet_read_line(process->out, NULL), "git-filter-server"); > + if (err) { > + error("external filter '%s' does not support long running filter protocol", cmd); > + goto done; > + } > + err = strcmp(packet_read_line(process->out, NULL), "version=2"); > + if (err) > + goto done; > + > + err = packet_write_list(process->in, "clean=true", "smudge=true", NULL); > + > + for (;;) > + { > + cap_buf = packet_read_line(process->out, NULL); > + if (!cap_buf) > + break; > + string_list_split_in_place(&cap_list, cap_buf, '=', 1); > + > + if (cap_list.nr != 2 || strcmp(cap_list.items[1].string, "true")) > + continue; > + > + cap_name = cap_list.items[0].string; > + if (!strcmp(cap_name, "clean")) { > + entry->supported_capabilities |= CAP_CLEAN; > + } else if (!strcmp(cap_name, "smudge")) { > + entry->supported_capabilities |= CAP_SMUDGE; > + } else { > + warning( > + "external filter '%s' requested unsupported filter capability '%s'", > + cmd, cap_name > + ); > + } > + > + string_list_clear(&cap_list, 0); > + } > + > +done: > + sigchain_pop(SIGPIPE); > + > + if (err || errno == EPIPE) { > + error("initialization for external filter '%s' failed", cmd); > + kill_multi_file_filter(hashmap, entry); > + return NULL; > + } > + > + hashmap_add(hashmap, entry); > + return entry; > +} > + > +static void read_multi_file_filter_values(int fd, struct strbuf *status) { > + struct strbuf **pair; > + char *line; > + for (;;) { > + line = packet_read_line(fd, NULL); > + if (!line) > + break; > + pair = strbuf_split_str(line, '=', 2); > + if (pair[0] && pair[0]->len && pair[1]) { > + if (!strcmp(pair[0]->buf, "status=")) { > + strbuf_reset(status); > + strbuf_addbuf(status, pair[1]); > + } > + } > + } > +} > + > +static int apply_multi_file_filter(const char *path, const char *src, size_t len, > + int fd, struct strbuf *dst, const char *cmd, > + const int wanted_capability) > +{ > + int err; > + struct cmd2process *entry; > + struct child_process *process; > + struct stat file_stat; > + struct strbuf nbuf = STRBUF_INIT; > + struct strbuf filter_status = STRBUF_INIT; > + char *filter_type; > + > + if (!cmd_process_map_initialized) { > + cmd_process_map_initialized = 1; > + hashmap_init(&cmd_process_map, (hashmap_cmp_fn) cmd2process_cmp, 0); > + entry = NULL; > + } else { > + entry = find_multi_file_filter_entry(&cmd_process_map, cmd); > + } > + > + fflush(NULL); > + > + if (!entry) { > + entry = start_multi_file_filter(&cmd_process_map, cmd); > + if (!entry) > + return 0; > + } > + process = &entry->process; > + > + if (!(wanted_capability & entry->supported_capabilities)) > + return 0; > + > + if (CAP_CLEAN & wanted_capability) > + filter_type = "clean"; > + else if (CAP_SMUDGE & wanted_capability) > + filter_type = "smudge"; > + else > + die("unexpected filter type"); > + > + if (fd >= 0 && !src) { > + if (fstat(fd, &file_stat) == -1) > + return 0; > + len = xsize_t(file_stat.st_size); > + } > + > + sigchain_push(SIGPIPE, SIG_IGN); > + > + > + err = (strlen(filter_type) > PKTLINE_DATA_MAXLEN); Extra () needed ? More () in the code... No more comments today (except that the kill is overkill)