Re: Creating objects manually and repack

Shawn Pearce <spearce@xxxxxxxxxxx> wrote:
> Jon Smirl <jonsmirl@xxxxxxxxx> wrote:
> > On 8/5/06, Martin Langhoff <martin.langhoff@xxxxxxxxx> wrote:
> > >On 8/5/06, Jon Smirl <jonsmirl@xxxxxxxxx> wrote:
> > >> On 8/4/06, Linus Torvalds <torvalds@xxxxxxxx> wrote:
> > >> > and you're basically all done. The above would turn each *,v file into a
> > >> > *-<sha>.pack/*-<sha>.idx file pair, so you'd have exactly as many
> > >> > pack-files as you have *,v files.
> > >>
> > >> I'll end up with 110,000 pack files.
> > >
> > >Then just do it every 100 files, and you'll only have 1,100 pack
> > >files, and it'll be fine.
> > 
> > This is something that has to be tuned. If you wait too long
> > everything spills out of RAM and you go totally IO bound for days. If
> > you do it too often you end up with too many packs and it takes a day
> > to repack them.
> > 
> > If I had a way to pipe all of the objects into repack one at a
> > time, without repack doing multiple passes, none of this tuning would be
> > necessary. In this model the standalone objects never get created in
> > the first place. The fastest IO is IO that has been eliminated.
> 
> I'm almost done with what I'm calling `git-fast-import`.

OK, now I'm done.  I'm attaching the code.  Toss it into the Makefile
as git-fast-import and recompile.

I tested it with the following Perl script, feeding it a list of
files I wanted blobs for on STDIN:

	while (<>) {
		chomp;                          # strip the trailing newline
		print pack('L', -s $_);         # 4-byte native-endian length prefix
		open(F, '<', $_) or die "$_: $!";
		my $buf;
		print $buf while read(F, $buf, 128*1024) > 0;
		close F;
	}

This gave me an execution order of:

	find . -name '*.c' | perl test.pl | git-fast-import in.pack
	git-index-pack in.pack

at which point in.pack is a completely valid pack file with its
index in in.idx.  Move these into .git/objects/pack, generate trees
and commits, and run git-repack -a -d.  If the order in which you
feed objects to git-fast-import is reasonable (process one RCS file
at a time, feeding revisions from most recent to least recent), you
may not get any major benefit from using -f during your final repack.
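
For example, the install-and-repack step could look something like
this (a rough sketch, not part of the tool; it assumes git-index-pack
prints the pack's SHA-1 on stdout, which is what names the pair on
disk):

	sha1=$(git-index-pack in.pack)	# also writes in.idx next to in.pack
	mv in.pack .git/objects/pack/pack-$sha1.pack
	mv in.idx .git/objects/pack/pack-$sha1.idx
	git-repack -a -d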

The code for git-fast-import could probably be tweaked to accept
trees and commits too, which would permit you to stream the entire
CVS repository into a single pack file.  :-)

I can't help you decompress the RCS files faster, but this should
help you generate the GIT pack faster.  Hopefully you can make
use of it!

-- 
Shawn.
#include "builtin.h"
#include "cache.h"
#include "object.h"
#include "blob.h"
#include "delta.h"
#include "pack.h"
#include "csum-file.h"

static int max_depth = 10;
static unsigned long object_count;
static int packfd;
static int current_depth;
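/* the previously written blob, kept as the delta base for the next one */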
static void *lastdat;
static unsigned long lastdatlen;
static unsigned char lastsha1[20];

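/* Read exactly `length` bytes from fd, retrying on short reads;
 * a return of less than `length` means EOF or an error.
 */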
static ssize_t yread(int fd, void *buffer, size_t length)
{
	ssize_t ret = 0;
	while (ret < length) {
		ssize_t size = xread(fd, (char *) buffer + ret, length - ret);
		if (size < 0) {
			return size;
		}
		if (size == 0) {
			return ret;
		}
		ret += size;
	}
	return ret;
}

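/* Write exactly `length` bytes to fd, retrying on short writes. */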
static ssize_t ywrite(int fd, void *buffer, size_t length)
{
	ssize_t ret = 0;
	while (ret < length) {
		ssize_t size = xwrite(fd, (char *) buffer + ret, length - ret);
		if (size < 0) {
			return size;
		}
		if (size == 0) {
			return ret;
		}
		ret += size;
	}
	return ret;
}

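/* Encode a pack entry header: bits 6-4 of the first byte carry the
 * object type; the size goes out little-endian base-128 style, 4 bits
 * in the first byte and 7 bits in each byte after it, with bit 7 set
 * on every byte except the last.
 */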
static unsigned long encode_header(enum object_type type, unsigned long size, unsigned char *hdr)
{
	int n = 1;
	unsigned char c;

	if (type < OBJ_COMMIT || type > OBJ_DELTA)
		die("bad type %d", type);

	c = (type << 4) | (size & 15);
	size >>= 4;
	while (size) {
		*hdr++ = c | 0x80;
		c = size & 0x7f;
		size >>= 7;
		n++;
	}
	*hdr = c;
	return n;
}

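/* Write one object into the pack: as an OBJ_DELTA against the
 * previous blob while the chain is shallower than max_depth,
 * otherwise as a full OBJ_BLOB, which restarts the chain.
 */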
static void write_blob (void *dat, unsigned long datlen)
{
	z_stream s;
	void *out, *delta;
	unsigned char hdr[64];
	unsigned long hdrlen, deltalen;

	if (lastdat && current_depth < max_depth) {
		delta = diff_delta(lastdat, lastdatlen,
			dat, datlen,
			&deltalen, 0);
	} else
		delta = 0;

	memset(&s, 0, sizeof(s));
	deflateInit(&s, zlib_compression_level);

	if (delta) {
		current_depth++;
		s.next_in = delta;
		s.avail_in = deltalen;
		hdrlen = encode_header(OBJ_DELTA, deltalen, hdr);
		if (ywrite(packfd, hdr, hdrlen) != hdrlen)
			die("Can't write object header: %s", strerror(errno));
		if (ywrite(packfd, lastsha1, sizeof(lastsha1)) != sizeof(lastsha1))
			die("Can't write object base: %s", strerror(errno));
	} else {
		current_depth = 0;
		s.next_in = dat;
		s.avail_in = datlen;
		hdrlen = encode_header(OBJ_BLOB, datlen, hdr);
		if (ywrite(packfd, hdr, hdrlen) != hdrlen)
			die("Can't write object header: %s", strerror(errno));
	}

	s.avail_out = deflateBound(&s, s.avail_in);
	s.next_out = out = xmalloc(s.avail_out);
	while (deflate(&s, Z_FINISH) == Z_OK)
		/* nothing */;
	deflateEnd(&s);

	if (ywrite(packfd, out, s.total_out) != s.total_out)
		die("Failed writing compressed data %s", strerror(errno));

	free(out);
	if (delta)
		free(delta);
}

/* Write the pack header: "PACK" magic, version 2, and a zero
 * object count that fixup_header_footer() patches in later.
 */
static void init_pack_header (void)
{
	const char* magic = "PACK";
	unsigned long version = 2;
	unsigned long zero = 0;

	version = htonl(version);

	if (ywrite(packfd, (char*)magic, 4) != 4)
		die("Can't write pack magic: %s", strerror(errno));
	if (ywrite(packfd, &version, 4) != 4)
		die("Can't write pack version: %s", strerror(errno));
	if (ywrite(packfd, &zero, 4) != 4)
		die("Can't write 0 object count: %s", strerror(errno));
}

/* Patch the real object count into the header, then append the
 * SHA-1 checksum of the whole pack as the trailer.
 */
static void fixup_header_footer (void)
{
	SHA_CTX c;
	char hdr[8];
	unsigned char sha1[20];
	unsigned long cnt;
	char *buf;
	ssize_t n;

	if (lseek(packfd, 0, SEEK_SET) != 0)
		die("Failed seeking to start: %s", strerror(errno));

	SHA1_Init(&c);
	if (yread(packfd, hdr, 8) != 8)
		die("Failed reading header: %s", strerror(errno));
	SHA1_Update(&c, hdr, 8);

	fprintf(stderr, "%lu objects\n", object_count);
	cnt = htonl(object_count);
	SHA1_Update(&c, &cnt, 4);
	if (ywrite(packfd, &cnt, 4) != 4)
		die("Failed writing object count: %s", strerror(errno));

	buf = xmalloc(128 * 1024);
	for (;;) {
		n = xread(packfd, buf, 128 * 1024);
		if (n <= 0)
			break;
		SHA1_Update(&c, buf, n);
	}
	free(buf);

	SHA1_Final(sha1, &c);
	if (ywrite(packfd, sha1, sizeof(sha1)) != sizeof(sha1))
		die("Failed writing pack checksum: %s", strerror(errno));
}

int main (int argc, const char **argv)
{
	if (argc != 2)
		die("usage: git-fast-import <pack-file>");

	packfd = open(argv[1], O_RDWR|O_CREAT|O_TRUNC, 0666);
	if (packfd < 0)
		die("Can't create pack file %s: %s", argv[1], strerror(errno));

	init_pack_header();
	for (;;) {
		unsigned int datlen;	/* exactly 4 bytes, matching the Perl pack('L') */
		int hdrlen;
		void *dat;
		char hdr[128];
		unsigned char sha1[20];
		SHA_CTX c;

		/* Each record on stdin is a 4-byte native-endian length
		 * followed by that many bytes of raw blob data.
		 */
		if (yread(0, &datlen, 4) != 4)
			break;

		dat = xmalloc(datlen);
		if (yread(0, dat, datlen) != datlen)
			die("Truncated input while reading blob data");

		hdrlen = sprintf(hdr, "blob %u", datlen) + 1;
		SHA1_Init(&c);
		SHA1_Update(&c, hdr, hdrlen);
		SHA1_Update(&c, dat, datlen);
		SHA1_Final(sha1, &c);

		write_blob(dat, datlen);
		object_count++;
		printf("%s\n", sha1_to_hex(sha1));
		fflush(stdout);

		if (lastdat)
			free(lastdat);
		lastdat = dat;
		lastdatlen = datlen;
		memcpy(lastsha1, sha1, sizeof(sha1));
	}
	fixup_header_footer();
	close(packfd);

	return 0;
}
