Re: test-tool: bloom: generate_filter for multiple string?

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2021-01-12 14:53:32-0500, Taylor Blau <me@xxxxxxxxxxxx> wrote:
> On Thu, Dec 31, 2020 at 10:54:38AM +0700, Đoàn Trần Công Danh wrote:
> > I'm reading the code for Bloom Filter to see if arXiv:2012.00472
> > could be an improvement.
> 
> I'm late to the party, but I'm curious to hear which part of this
> article you think would help out the Bloom filter implementation.

Uhm, no. The article doesn't help the Bloom filter implementation.
The article was suggesting using Bloom filter to speed-up the
negotiation in fetch-pack and upload-pack. Which, in my own quick
experience, doesn't help much. Maybe it's me not understand the
article idea or I have made a naive implementation. However, I'm not
convinced to pursued further.

If you are curious, I'm attaching 2 quick-and-low-quality patches with
this email for your consideration.

-- 
Danh
>From 287d1b94606bbc5475909e5298560cbd28ee998e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=90o=C3=A0n=20Tr=E1=BA=A7n=20C=C3=B4ng=20Danh?=
 <congdanhqx@xxxxxxxxx>
Date: Thu, 31 Dec 2020 10:58:22 +0700
Subject: [PATCH 2/3] fetch-pack: implent bloom filter in client side
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Đoàn Trần Công Danh <congdanhqx@xxxxxxxxx>
---
 fetch-pack.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 81 insertions(+), 4 deletions(-)

diff --git a/fetch-pack.c b/fetch-pack.c
index 876f90c759..f423ccd816 100644
--- a/fetch-pack.c
+++ b/fetch-pack.c
@@ -23,6 +23,9 @@
 #include "fetch-negotiator.h"
 #include "fsck.h"
 #include "shallow.h"
+#include "bloom.h"
+#include "list-objects.h"
+#include "revision.h"
 
 static int transfer_unpack_limit = -1;
 static int fetch_unpack_limit = -1;
@@ -1184,6 +1187,71 @@ static int add_haves(struct fetch_negotiator *negotiator,
 	return ret;
 }
 
+static int rev_info_insert_ref_oid(const char *refname,
+				   const struct object_id *oid,
+				   int flag, void *cbdata)
+{
+	struct rev_info *rev_info = cbdata;
+	add_pending_oid(rev_info, refname, oid, 0);
+	return 1;
+}
+
+static void rev_info_insert_to_oidset(struct commit *commit, void *cbdata)
+{
+	struct oidset *set = cbdata;
+	oidset_insert(set, &commit->object.oid);
+}
+
+static void add_bloom(struct strbuf *req_buf, struct oidset *common)
+{
+	struct oidset_iter iter;
+	struct oidset haves = OIDSET_INIT;
+	struct rev_info rev_info;
+	struct bloom_filter filter;
+	struct bloom_filter_settings settings = DEFAULT_BLOOM_FILTER_SETTINGS;
+	struct strbuf buf;
+	const struct object_id *oid;
+	int i;
+
+	if (oidset_size(common) == 0)
+		return;
+	
+	repo_init_revisions(the_repository, &rev_info, NULL);
+
+	oidset_iter_init(common, &iter);
+	while ((oid = oidset_iter_next(&iter))) {
+		add_pending_oid(&rev_info, oid_to_hex(oid), oid,
+			       UNINTERESTING | BOTTOM);
+	}
+
+	for_each_rawref(rev_info_insert_ref_oid, &rev_info);
+	if (prepare_revision_walk(&rev_info)) {
+		warning("couldn't prepare revision walk for fetch");
+		return;
+	}
+	traverse_commit_list(&rev_info, rev_info_insert_to_oidset, NULL, &haves);
+	if (oidset_size(&haves) == 0)
+		return;
+	filter.len = (oidset_size(&haves) * settings.bits_per_entry + BITS_PER_WORD - 1)
+		/ BITS_PER_WORD;
+	free(filter.data);
+	filter.data = xcalloc(filter.len, sizeof(unsigned char));
+	oidset_iter_init(&haves, &iter);
+	while ((oid = oidset_iter_next(&iter))) {
+		struct bloom_key key;
+		fill_bloom_key((const char *)oid->hash, the_hash_algo->rawsz,
+			       &key, &settings);
+		add_key_to_filter(&key, &filter, &settings);
+	}
+
+	strbuf_init(&buf, filter.len * 2 + strlen("bloom \n"));
+	strbuf_addstr(&buf, "bloom ");
+	for (i = 0; i < filter.len; i++)
+		strbuf_addf(&buf, "%02x", filter.data[i]);
+	strbuf_addch(&buf, '\n');
+	packet_buf_write_len(req_buf, buf.buf, buf.len);
+}
+
 static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 			      struct fetch_pack_args *args,
 			      const struct ref *wants, struct oidset *common,
@@ -1191,6 +1259,7 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 			      int sideband_all, int seen_ack)
 {
 	int ret = 0;
+	int transferbloom = 0;
 	const char *hash_name;
 	struct strbuf req_buf = STRBUF_INIT;
 
@@ -1278,6 +1347,11 @@ static int send_fetch_request(struct fetch_negotiator *negotiator, int fd_out,
 	ret = add_haves(negotiator, seen_ack, &req_buf,
 			haves_to_send, in_vain);
 
+	/* Add bloom filter represent commits we have from known common */
+	git_config_get_maybe_bool("transfer.bloom", &transferbloom);
+	if (transferbloom && server_supports_v2("bloom", 0))
+		add_bloom(&req_buf, common);
+
 	/* Send request */
 	packet_buf_flush(&req_buf);
 	if (write_in_full(fd_out, req_buf.buf, req_buf.len) < 0)
@@ -1352,11 +1426,14 @@ static enum common_found process_acks(struct fetch_negotiator *negotiator,
 			struct object_id oid;
 			received_ack = 1;
 			if (!get_oid_hex(arg, &oid)) {
+				struct object *obj;
 				struct commit *commit;
-				oidset_insert(common, &oid);
-				commit = lookup_commit(the_repository, &oid);
-				if (negotiator)
-					negotiator->ack(negotiator, commit);
+				obj = lookup_object(the_repository, &oid);
+				if (obj && (commit = object_as_type(obj, OBJ_COMMIT, 0))) {
+					oidset_insert(common, &oid);
+					if (negotiator)
+						negotiator->ack(negotiator, commit);
+				}
 			}
 			continue;
 		}
-- 
2.30.0.4.gbe3229325d

>From 2c93a1a6c37d40ba37eb083a139b6f1312a547e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C4=90o=C3=A0n=20Tr=E1=BA=A7n=20C=C3=B4ng=20Danh?=
 <congdanhqx@xxxxxxxxx>
Date: Mon, 4 Jan 2021 20:23:29 +0700
Subject: [PATCH 3/3] upload-pack: support bloom filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Đoàn Trần Công Danh <congdanhqx@xxxxxxxxx>
---
 serve.c              |  1 +
 t/t5701-git-serve.sh |  1 +
 upload-pack.c        | 86 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 88 insertions(+)

diff --git a/serve.c b/serve.c
index eec2fe6f29..daea53bfb7 100644
--- a/serve.c
+++ b/serve.c
@@ -78,6 +78,7 @@ static struct protocol_capability capabilities[] = {
 	{ "server-option", always_advertise, NULL },
 	{ "object-format", object_format_advertise, NULL },
 	{ "session-id", session_id_advertise, NULL },
+	{ "bloom", always_advertise, NULL },
 };
 
 static void advertise_capabilities(void)
diff --git a/t/t5701-git-serve.sh b/t/t5701-git-serve.sh
index a1f5fdc9fd..9fabdba6b1 100755
--- a/t/t5701-git-serve.sh
+++ b/t/t5701-git-serve.sh
@@ -16,6 +16,7 @@ test_expect_success 'test capability advertisement' '
 	fetch=shallow
 	server-option
 	object-format=$(test_oid algo)
+	bloom
 	0000
 	EOF
 
diff --git a/upload-pack.c b/upload-pack.c
index 3b66bf92ba..0eff165865 100644
--- a/upload-pack.c
+++ b/upload-pack.c
@@ -27,6 +27,7 @@
 #include "commit-graph.h"
 #include "commit-reach.h"
 #include "shallow.h"
+#include "bloom.h"
 
 /* Remember to update object flag allocation in object.h */
 #define THEY_HAVE	(1u << 11)
@@ -62,6 +63,8 @@ struct upload_pack_data {
 	struct object_array have_obj;
 	struct oid_array haves;					/* v2 only */
 	struct string_list wanted_refs;				/* v2 only */
+	struct bloom_filter bloom_filter;			/* v2 only */
+	struct bloom_filter_settings bloom_filter_settings;
 
 	struct object_array shallows;
 	struct string_list deepen_not;
@@ -113,6 +116,13 @@ struct upload_pack_data {
 	unsigned advertise_sid : 1;
 };
 
+struct upload_pack_bloom_args
+{
+	struct bloom_filter *bloom_filter;
+	struct bloom_filter_settings *bloom_filter_settings;
+	struct oid_array *acks;
+};
+
 static void upload_pack_data_init(struct upload_pack_data *data)
 {
 	struct string_list symref = STRING_LIST_INIT_DUP;
@@ -125,6 +135,7 @@ static void upload_pack_data_init(struct upload_pack_data *data)
 	struct string_list uri_protocols = STRING_LIST_INIT_DUP;
 	struct object_array extra_edge_obj = OBJECT_ARRAY_INIT;
 	struct string_list allowed_filters = STRING_LIST_INIT_DUP;
+	struct bloom_filter_settings bloom_filter_settings = DEFAULT_BLOOM_FILTER_SETTINGS;
 
 	memset(data, 0, sizeof(*data));
 	data->symref = symref;
@@ -132,6 +143,7 @@ static void upload_pack_data_init(struct upload_pack_data *data)
 	data->want_obj = want_obj;
 	data->have_obj = have_obj;
 	data->haves = haves;
+	data->bloom_filter_settings = bloom_filter_settings;
 	data->shallows = shallows;
 	data->deepen_not = deepen_not;
 	data->uri_protocols = uri_protocols;
@@ -1464,6 +1476,34 @@ static int parse_have(const char *line, struct oid_array *haves)
 	return 0;
 }
 
+static int parse_bloom(const char *line, struct bloom_filter *filter,
+		       struct bloom_filter_settings *settings)
+{
+	const char *arg;
+	unsigned char *p;
+	size_t len;
+	if (!skip_prefix(line, "bloom ", &arg))
+		return 0;
+
+	len = strlen(arg);
+	if (len % 2)
+		die("git upload-pack: corrupted bloom data with len: %zu", len);
+	filter->len = len / 2;
+	filter->data = p = xmalloc(filter->len);
+	while (*arg) {
+		int val = hex2chr(arg);
+		if (val < 0)
+			goto errout;
+		*p++ = val;
+		arg += 2;
+	}
+	return 1;
+errout:
+	filter->len = 0;
+	FREE_AND_NULL(filter->data);
+	return 0;
+}
+
 static void process_args(struct packet_reader *request,
 			 struct upload_pack_data *data)
 {
@@ -1482,6 +1522,9 @@ static void process_args(struct packet_reader *request,
 		if (parse_have(arg, &data->haves))
 			continue;
 
+		if (parse_bloom(arg, &data->bloom_filter, &data->bloom_filter_settings))
+			continue;
+
 		/* process args like thin-pack */
 		if (!strcmp(arg, "thin-pack")) {
 			data->use_thin_pack = 1;
@@ -1570,6 +1613,48 @@ static int process_haves(struct upload_pack_data *data, struct oid_array *common
 	return 0;
 }
 
+static void rev_info_bloom_ack(struct commit *commit, void *cbdata)
+{
+	struct upload_pack_bloom_args *args = cbdata;
+	struct bloom_key key;
+	struct object_id *oid = &commit->object.oid;
+	fill_bloom_key((const char *)oid->hash, the_hash_algo->rawsz,
+		       &key, args->bloom_filter_settings);
+	if (bloom_filter_contains(args->bloom_filter, &key,
+				  args->bloom_filter_settings))
+		oid_array_append(args->acks, oid);
+}
+
+/* Walking from wanted_obj until haves, add all MAYBE objects to acks */
+static int process_bloom(struct upload_pack_data *data, struct oid_array *acks)
+{
+	struct upload_pack_bloom_args args;
+	struct rev_info rev_info;
+	const struct object_id *oid;
+	int i;
+
+	/* unknown bloom data and/or no known common objects */
+	if (!data->bloom_filter.len || !acks->nr)
+		return 0;
+	repo_init_revisions(the_repository, &rev_info, NULL);
+	for (i = 0; i < acks->nr; i++) {
+		oid = &acks->oid[i];
+		add_pending_oid(&rev_info, oid_to_hex(oid), oid,
+				UNINTERESTING | BOTTOM);
+	}
+	for (i = 0; i < data->want_obj.nr; i++) {
+		oid = &data->want_obj.objects[i].item->oid;
+		add_pending_oid(&rev_info, oid_to_hex(oid), oid, 0);
+	}
+	if (prepare_revision_walk(&rev_info))
+		return 0;
+	args.bloom_filter = &data->bloom_filter;
+	args.bloom_filter_settings = &data->bloom_filter_settings;
+	args.acks = acks;
+	traverse_commit_list(&rev_info, rev_info_bloom_ack, NULL, &args);
+	return 0;
+}
+
 static int send_acks(struct upload_pack_data *data, struct oid_array *acks)
 {
 	int i;
@@ -1600,6 +1685,7 @@ static int process_haves_and_send_acks(struct upload_pack_data *data)
 	int ret = 0;
 
 	process_haves(data, &common);
+	process_bloom(data, &common);
 	if (data->done) {
 		ret = 1;
 	} else if (send_acks(data, &common)) {
-- 
2.30.0.4.gbe3229325d


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]

  Powered by Linux