On 2021-01-12 14:53:32-0500, Taylor Blau <me@xxxxxxxxxxxx> wrote: > On Thu, Dec 31, 2020 at 10:54:38AM +0700, Đoàn Trần Công Danh wrote: > > I'm reading the code for Bloom Filter to see if arXiv:2012.00472 > > could be an improvement. > > I'm late to the party, but I'm curious to hear which part of this > article you think would help out the Bloom filter implementation. Uhm, no. The article doesn't help the Bloom filter implementation. The article was suggesting using Bloom filter to speed-up the negotiation in fetch-pack and upload-pack. Which, in my own quick experience, doesn't help much. Maybe it's me not understand the article idea or I have made a naive implementation. However, I'm not convinced to pursued further. If you are curious, I'm attaching 2 quick-and-low-quality patches with this email for your consideration. -- Danh
>From 287d1b94606bbc5475909e5298560cbd28ee998e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C4=90o=C3=A0n=20Tr=E1=BA=A7n=20C=C3=B4ng=20Danh?= <congdanhqx@xxxxxxxxx> Date: Thu, 31 Dec 2020 10:58:22 +0700 Subject: [PATCH 2/3] fetch-pack: implent bloom filter in client side MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Đoàn Trần Công Danh <congdanhqx@xxxxxxxxx> --- fetch-pack.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 4 deletions(-) diff --git a/fetch-pack.c b/fetch-pack.c index 876f90c759..f423ccd816 100644 --- a/fetch-pack.c +++ b/fetch-pack.c @@ -23,6 +23,9 @@ #include "fetch-negotiator.h" #include "fsck.h" #include "shallow.h" +#include "bloom.h" +#include "list-objects.h" +#include "revision.h" static int transfer_unpack_limit = -1; static int fetch_unpack_limit = -1; @@ -1184,6 +1187,71 @@ static int add_haves(struct fetch_negotiator *negotiator, return ret; } +static int rev_info_insert_ref_oid(const char *refname, + const struct object_id *oid, + int flag, void *cbdata) +{ + struct rev_info *rev_info = cbdata; + add_pending_oid(rev_info, refname, oid, 0); + return 1; +} + +static void rev_info_insert_to_oidset(struct commit *commit, void *cbdata) +{ + struct oidset *set = cbdata; + oidset_insert(set, &commit->object.oid); +} + +static void add_bloom(struct strbuf *req_buf, struct oidset *common) +{ + struct oidset_iter iter; + struct oidset haves = OIDSET_INIT; + struct rev_info rev_info; + struct bloom_filter filter; + struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS; + struct strbuf buf; + const struct object_id *oid; + int i; + + if (oidset_size(common) == 0) + return; + + repo_init_revisions(the_repository, &rev_info, NULL); + + oidset_iter_init(common, &iter); + while ((oid = oidset_iter_next(&iter))) { + add_pending_oid(&rev_info, oid_to_hex(oid), oid, + UNINTERESTING | BOTTOM); + } + + for_each_rawref(rev_info_insert_ref_oid, &rev_info); + if (prepare_revision_walk(&rev_info)) { + warning("couldn't prepare revision walk for fetch"); + return; + } + traverse_commit_list(&rev_info, rev_info_insert_to_oidset, NULL, &haves); + if (oidset_size(&haves) == 0) + return; + filter.len = (oidset_size(&haves) * settings.bits_per_entry + BITS_PER_WORD - 1) + / BITS_PER_WORD; + free(filter.data); + filter.data = xcalloc(filter.len, sizeof(unsigned char)); + oidset_iter_init(&haves, &iter); + while ((oid = oidset_iter_next(&iter))) { + struct bloom_key key; + fill_bloom_key((const char *)oid->hash, the_hash_algo->rawsz, + &key, &settings); + add_key_to_filter(&key, &filter, &settings); + } + + strbuf_init(&buf, filter.len * 2 + strlen("bloom \n")); + strbuf_addstr(&buf, "bloom "); + for (i = 0; i < filter.len; i++) + strbuf_addf(&buf, "%02x", filter.data[i]); + strbuf_addch(&buf, '\n'); + packet_buf_write_len(req_buf, buf.buf, buf.len); +} + static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out, struct fetch_pack_args *args, const struct ref *wants, struct oidset *common, @@ -1191,6 +1259,7 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out, int sideband_all, int seen_ack) { int ret = 0; + int transferbloom = 0; const char *hash_name; struct strbuf req_buf = STRBUF_INIT; @@ -1278,6 +1347,11 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out, ret = add_haves(negotiator, seen_ack, &req_buf, haves_to_send, in_vain); + /* Add bloom filter represent commits we have from known common */ + git_config_get_maybe_bool("transfer.bloom", &transferbloom); + if (transferbloom && server_supports_v2("bloom", 0)) + add_bloom(&req_buf, common); + /* Send request */ packet_buf_flush(&req_buf); if (write_in_full(fd_out, req_buf.buf, req_buf.len) < 0) @@ -1352,11 +1426,14 @@ static enum common_found process_acks(struct fetch_negotiator *negotiator, struct object_id oid; received_ack = 1; if (!get_oid_hex(arg, &oid)) { + struct object *obj; struct commit *commit; - oidset_insert(common, &oid); - commit = lookup_commit(the_repository, &oid); - if (negotiator) - negotiator->ack(negotiator, commit); + obj = lookup_object(the_repository, &oid); + if (obj && (commit = object_as_type(obj, OBJ_COMMIT, 0))) { + oidset_insert(common, &oid); + if (negotiator) + negotiator->ack(negotiator, commit); + } } continue; } -- 2.30.0.4.gbe3229325d
>From 2c93a1a6c37d40ba37eb083a139b6f1312a547e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C4=90o=C3=A0n=20Tr=E1=BA=A7n=20C=C3=B4ng=20Danh?= <congdanhqx@xxxxxxxxx> Date: Mon, 4 Jan 2021 20:23:29 +0700 Subject: [PATCH 3/3] upload-pack: support bloom filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Đoàn Trần Công Danh <congdanhqx@xxxxxxxxx> --- serve.c | 1 + t/t5701-git-serve.sh | 1 + upload-pack.c | 86 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+) diff --git a/serve.c b/serve.c index eec2fe6f29..daea53bfb7 100644 --- a/serve.c +++ b/serve.c @@ -78,6 +78,7 @@ static struct protocol_capability capabilities[] = { { "server-option", always_advertise, NULL }, { "object-format", object_format_advertise, NULL }, { "session-id", session_id_advertise, NULL }, + { "bloom", always_advertise, NULL }, }; static void advertise_capabilities(void) diff --git a/t/t5701-git-serve.sh b/t/t5701-git-serve.sh index a1f5fdc9fd..9fabdba6b1 100755 --- a/t/t5701-git-serve.sh +++ b/t/t5701-git-serve.sh @@ -16,6 +16,7 @@ test_expect_success 'test capability advertisement' ' fetch=shallow server-option object-format=$(test_oid algo) + bloom 0000 EOF diff --git a/upload-pack.c b/upload-pack.c index 3b66bf92ba..0eff165865 100644 --- a/upload-pack.c +++ b/upload-pack.c @@ -27,6 +27,7 @@ #include "commit-graph.h" #include "commit-reach.h" #include "shallow.h" +#include "bloom.h" /* Remember to update object flag allocation in object.h */ #define THEY_HAVE (1u << 11) @@ -62,6 +63,8 @@ struct upload_pack_data { struct object_array have_obj; struct oid_array haves; /* v2 only */ struct string_list wanted_refs; /* v2 only */ + struct bloom_filter bloom_filter; /* v2 only */ + struct bloom_filter_settings bloom_filter_settings; struct object_array shallows; struct string_list deepen_not; @@ -113,6 +116,13 @@ struct upload_pack_data { unsigned advertise_sid : 1; }; +struct upload_pack_bloom_args +{ + struct bloom_filter *bloom_filter; + struct bloom_filter_settings *bloom_filter_settings; + struct oid_array *acks; +}; + static void upload_pack_data_init(struct upload_pack_data *data) { struct string_list symref = STRING_LIST_INIT_DUP; @@ -125,6 +135,7 @@ static void upload_pack_data_init(struct upload_pack_data *data) struct string_list uri_protocols = STRING_LIST_INIT_DUP; struct object_array extra_edge_obj = OBJECT_ARRAY_INIT; struct string_list allowed_filters = STRING_LIST_INIT_DUP; + struct bloom_filter_settings bloom_filter_settings = DEFAULT_BLOOM_FILTER_SETTINGS; memset(data, 0, sizeof(*data)); data->symref = symref; @@ -132,6 +143,7 @@ static void upload_pack_data_init(struct upload_pack_data *data) data->want_obj = want_obj; data->have_obj = have_obj; data->haves = haves; + data->bloom_filter_settings = bloom_filter_settings; data->shallows = shallows; data->deepen_not = deepen_not; data->uri_protocols = uri_protocols; @@ -1464,6 +1476,34 @@ static int parse_have(const char *line, struct oid_array *haves) return 0; } +static int parse_bloom(const char *line, struct bloom_filter *filter, + struct bloom_filter_settings *settings) +{ + const char *arg; + unsigned char *p; + size_t len; + if (!skip_prefix(line, "bloom ", &arg)) + return 0; + + len = strlen(arg); + if (len % 2) + die("git upload-pack: corrupted bloom data with len: %zu", len); + filter->len = len / 2; + filter->data = p = xmalloc(filter->len); + while (*arg) { + int val = hex2chr(arg); + if (val < 0) + goto errout; + *p++ = val; + arg += 2; + } + return 1; +errout: + filter->len = 0; + FREE_AND_NULL(filter->data); + return 0; +} + static void process_args(struct packet_reader *request, struct upload_pack_data *data) { @@ -1482,6 +1522,9 @@ static void process_args(struct packet_reader *request, if (parse_have(arg, &data->haves)) continue; + if (parse_bloom(arg, &data->bloom_filter, &data->bloom_filter_settings)) + continue; + /* process args like thin-pack */ if (!strcmp(arg, "thin-pack")) { data->use_thin_pack = 1; @@ -1570,6 +1613,48 @@ static int process_haves(struct upload_pack_data *data, struct oid_array *common return 0; } +static void rev_info_bloom_ack(struct commit *commit, void *cbdata) +{ + struct upload_pack_bloom_args *args = cbdata; + struct bloom_key key; + struct object_id *oid = &commit->object.oid; + fill_bloom_key((const char *)oid->hash, the_hash_algo->rawsz, + &key, args->bloom_filter_settings); + if (bloom_filter_contains(args->bloom_filter, &key, + args->bloom_filter_settings)) + oid_array_append(args->acks, oid); +} + +/* Walking from wanted_obj until haves, add all MAYBE objects to acks */ +static int process_bloom(struct upload_pack_data *data, struct oid_array *acks) +{ + struct upload_pack_bloom_args args; + struct rev_info rev_info; + const struct object_id *oid; + int i; + + /* unknown bloom data and/or no known common objects */ + if (!data->bloom_filter.len || !acks->nr) + return 0; + repo_init_revisions(the_repository, &rev_info, NULL); + for (i = 0; i < acks->nr; i++) { + oid = &acks->oid[i]; + add_pending_oid(&rev_info, oid_to_hex(oid), oid, + UNINTERESTING | BOTTOM); + } + for (i = 0; i < data->want_obj.nr; i++) { + oid = &data->want_obj.objects[i].item->oid; + add_pending_oid(&rev_info, oid_to_hex(oid), oid, 0); + } + if (prepare_revision_walk(&rev_info)) + return 0; + args.bloom_filter = &data->bloom_filter; + args.bloom_filter_settings = &data->bloom_filter_settings; + args.acks = acks; + traverse_commit_list(&rev_info, rev_info_bloom_ack, NULL, &args); + return 0; +} + static int send_acks(struct upload_pack_data *data, struct oid_array *acks) { int i; @@ -1600,6 +1685,7 @@ static int process_haves_and_send_acks(struct upload_pack_data *data) int ret = 0; process_haves(data, &common); + process_bloom(data, &common); if (data->done) { ret = 1; } else if (send_acks(data, &common)) { -- 2.30.0.4.gbe3229325d