Re: Git files data formats documentation

A Large Angry SCM <gitzilla@xxxxxxxxx> · Sat, 05 Aug 2006 15:35:51 -0700

A Large Angry SCM wrote:
This information may be useful for reading and writing the various Git 
files.

Revised.

Git files data formats
======================

OBJECTS
-------
# The object ID, or "name", of an object is
#	_sha-1_digest_( <OBJECT_HEADER> <object_CONTENTS> ).

<BLOB>
	:	_deflate_( <OBJECT_HEADER> <BLOB_CONTENTS> )
	|	<COMPACT_OBJECT_HEADER> _deflate_( <BLOB_CONTENTS> )
	;

<BLOB_CONTENTS>
	:	<DATA>
	;

<TREE>
	:	_deflate_( <OBJECT_HEADER> <TREE_CONTENTS> )
	|	<COMPACT_OBJECT_HEADER> _deflate_( <TREE_CONTENTS> )
	;

<TREE_CONTENTS>
	:	<TREE_ENTRIES>
	;

<TREE_ENTRIES>
	# Tree entries are sorted by the byte sequence that comprises
	# the entry name. However, for the purposes of the sort
	# comparison, entries for tree objects are compared as if the
	# entry name byte sequence has a trailing ASCII '/' (0x2f).
	:	( <TREE_ENTRY> )*
	;

<TREE_ENTRY>
	# The type of the object referenced MUST be appropriate for
	# the mode. Regular files and symbolic links reference a BLOB
	# and directories reference a TREE.
	:	<OCTAL_MODE> <SP> <NAME> <NUL> <BINARY_OBJ_ID>
	;

<COMMIT>
	:	_deflate_( <OBJECT_HEADER> <COMMIT_CONTENTS> )
	|	<COMPACT_OBJECT_HEADER> _deflate_( <COMMIT_CONTENTS> )
	;

<COMMIT_CONTENTS>
	:	"tree" <SP> <HEX_OBJ_ID> <LF>
		( "parent" <SP> <HEX_OBJ_ID> <LF> )*
		"author" <SP>
			<SAFE_NAME> <SP>
			<LT> <SAFE_EMAIL> <GT> <SP>
			<GIT_DATE> <LF>
		"committer" <SP>
			<SAFE_NAME> <SP>
			<LT> <SAFE_EMAIL> <GT> <SP>
			<GIT_DATE> <LF>
		<LF>
		<DATA>
	;

<TAG>
	:	_deflate_( <OBJECT_HEADER> <TAG_CONTENTS> )
	|	<COMPACT_OBJECT_HEADER> _deflate_( <TAG_CONTENTS> )
	;

<TAG_CONTENTS>
	:	"object" <SP> <HEX_OBJ_ID> <LF>
		"type" <SP> <OBJ_TYPE> <LF>
		"tag" <SP> <TAG_NAME> <LF>
		<LF>
		<DATA>
	;

<OBJECT_HEADER>
	:	<OBJ_TYPE> <SP> <DECIMAL_LENGTH> <NUL>
	;

<COMPACT_OBJECT_HEADER>
	# The object type DELTA_ENCODED is not valid in a
	# <COMPACT_OBJECT_HEADER>.
	:	<TYPE_AND_BASE128_SIZE>
	;

<DATA>
	# Uninterpreted sequence of bytes.
	;

<OCTAL_MODE>
	# ASCII encoding of the octal encoding, without prefix or
	# leading zeros, of the file system object type and permission
	# bits. The bit layout is according to the POSIX standard, with
	# only regular files, directories, and symbolic links
	# permitted. The actual permission bits are all zero except for
	# regular files. The only permission bit of any consequence to
	# Git is the owner executable bit. By default, the permission
	# bits for files will be either 0644 or 0755, depending on the
	# owner executable bit.
	;

<NAME>
	# Sequence of bytes not containing the ASCII character byte
	# values NUL (0x00) or "/" (0x2f).
	;

<BINARY_OBJ_ID>
	# The object ID of the referenced object.
	;

<HEX_OBJ_ID>
	# Hexidecimal encoding (lower case) of the <BINARY_OBJ_ID>.
	;

<SAFE_NAME>
	:	<SAFE_STRING> 
	;

<SAFE_EMAIL>
	:	<SAFE_STRING>
	;

<SAFE_STRING>
	# A sequence of bytes not containing the ASCII character byte
	# values NUL (0x00), LF (0x0a), '<' (0c3c), or '>' (0x3e).
	#
	# The sequence may not begin or end with any bytes with the
	# following ASCII character byte values: SPACE (0x20),
	# '.' (0x2e), ',' (0x2c), ':' (0x3a), ';' (0x3b), '<' (0x3c),
	# '>' (0x3e), '"' (0x22), "'" (0x27).
	;

<GIT_DATE>
	:	<SECONDS> <SP> <TZ_OFFSET>
	;

<SECONDS>
	# Base 10, ASCII encoding of the number of seconds since 12:00
	# midnight January 1, 1970, UTC without accounting for leap
	# seconds, and without leading zeros.
	;

<TZ_OFFSET>
	# Signed offset of time zone from UTC.
	:	<TZ_OFFSET_SIGN> <TZ_OFFSET_HOURS> <TZ_OFFSET_MIN>
	;

<TZ_OFFSET_SIGN>
	:	"+"
	|	"-"
	;

<TZ_OFFSET_HOURS>
	:	<DIGIT> <DIGIT>
	;

<TZ_OFFSET_MIN>
	# Valid values are "00" to "59" inclusive.
	:	<DIGIT> <DIGIT>
	;

<DIGIT>
	# ASCII decimal digit.
	;

<OBJ_TYPE>
	:	"blob"
	|	"tree"
	|	"commit"
	|	"tag"
	;

<DECIMAL_LENGTH>
	# Base 10, ASCII encoding of the byte length of the object
	# contents, without leading zeros. The length value does not
	# include the length of the <OBJECT_HEADER>.
	:	( <DIGIT> )+
	;

<SP>
	# ASCII SPACE (0x20) character.
	;

<NUL>
	# ASCII NUL (0x00) character.
	;

<LF>
	# ASCII LF (0x0a) "line feed" character.
	;

PACK FILE
---------
# The name of a pack file is "pack-${PACK_ID}.pack", where ${PACK_ID}
# is the hexidecimal encoding (lower case) of the SHA-1 digest of the
# sorted list of binary object IDs in the pack file without a separator
# between the object IDs. Initially, the ${PACK_ID} for a pack was not
# well defined, effectively making the value 40 random hexidecimal
# (lower case) characters. Recent, non git-core, code depends on the
# uniqueness of ${PACK_ID} across all of the object databases used by
# a repository.

<PACK_FILE>
	:	<PACK_FILE_CONTENTS> <PACK_FILE_CHECKSUM>
	;

<PACK_FILE_CONTENTS>
	:	"PACK" <PACK_VERSION> <PACK_OBJECT_COUNT>
		( <PACKED_OBJECT_HEADER> <PACKED_OBJECT_DATA> )*
		<PACK_FILE_CHECKSUM>
	;

<PACK_VERSION>
	# 32 bit, network byte order, binary integer indicating which
	# version of the pack file format was used to create the pack
	# file.
	;

<PACK_OBJECT_COUNT>
	# 32 bit, network byte order, binary integer containg the
	# number of objects encoded in the pack file.
	;

<PACK_FILE_CHECKSUM>
	:	_sha-1_digest_( <PACK_FILE_CONTENTS> )
	;

<PACKED_OBJECT_HEADER>
	# If the object type is not a DELTA_ENCODED object, the packed
	# object data that follows is the deflated byte sequence of the
	# object without the Git object header. The length value is the
	# byte count of the inflated byte sequence of the object.
	#
	# If the object type is a DELTA_ENCODED object, what follows is
	# the ID of the base object and the deflated delta data to
	# transform the base object into the target object. The type of
	# the target object is the same as that of the base object and
	# the length value is the byte count of the inflated delta
	# data. The base object may also be DELTA_ENCODED but cyclic
	# base object chains are not permitted and the pack file MUST
	# contain all base objects.
	:	<TYPE_AND_BASE128_SIZE>
	;

<TYPE_AND_BASE128_SIZE>
	# A compact, variable length, encoding of the packed object
	# length and type. The first byte is comprised of 3 fields
	# (where bit 0 is the least significant bit in a byte):
	#	bit 7:		more flag
	#	bits 6-4:	object type
	#	bits 3-0:	least significant bits of the object
	#			length.
	# If the more flag is set, the next byte contains more object
	# length bits.
	# The object types corresponding to the object type bits are:
	#	6 5 4
	#	- - -
	#	0 0 0	invalid: Reserved
	#	0 0 1	COMMIT object
	#	0 1 0	TREE object
	#	0 1 1	BLOB object
	#	1 0 0	TAG object
	#	1 0 1	invalid: Reserved
	#	1 1 0	invalid: Reserved
	#	1 1 1	DELTA_ENCODED object
	#
	# If the more flag was set, the next byte will have more length
	# bits and will be comprised to 2 fields:
	#	bit 7:		more flag
	#	bits 6-0:	7 additional, more significant, bits of
	#			the object length
	# If the more flag is set, the next byte contains more object
	# length bits using the same encoding.
	;

<PACKED_OBJECT_DATA>
	:	_deflate_( <DATA> )
	|	<BINARY_OBJ_ID> _deflate_( <DELTA_DATA> )
	;

<DELTA_DATA>
	# Size of the base object encoded as a base 128 number, least
	# significant bits first, using bit 7 (the most significant
	# bit) of each byte to indicate that more bits follow.
	#
	# Size of the result object encoded as a base 128 number, using
	# the same method as used for the base object size.
	#
	# There will then be a sequence of delta hunks.
	# Zero as the value of the first byte of a hunk in reserved.
	#
	# If bit 7 of the first byte of a delta hunk is not set, the
	# hunk is an "insert" hunk and bits 0-6 specify the number of
	# bytes to append to the output buffer from the hunk.
	#
	# If bit 7 of the first byte of a delta hunk is set, the hunk
	# is a "copy" hunk and bits 0-6 specify how the remaining
	# bytes in the hunk make up the base offset and length for the
	# copy. The following C code demonstrate how to determine the
	# base offset and length for the copy:
	#
	#	/* -  -  -  -  -  -  -  -  -  -  -  - *\
	#	 | This reflects version 3 pack files |
	#	\* -  -  -  -  -  -  -  -  -  -  -  - */
	#
	#	byte *data = delta_hunk_start
	#	opcode = *data++
	#	off_t copy_offset= 0;
	#	size_t copy_length = 0;
	#
	#	for (shift=i=0; i<4; i++) {
	#		if (opcode & 0x01) {
	#			copy_offset |= (*data++)<<shift;
	#			}
	#		opcode >>= 1;
	#		shift += 8;
	#		}
	#
	#	for (shift=i=0; i<3; i++) {
	#		if (opcode & 0x01) {
	#			copy_length |= (*data++)<<shift;
	#			}
	#		opcode >>= 1;
	#		shift += 8;
	#		}
	#
	#	if (!copy_length) {
	#		copy_length = 1<<16;
	#		}
	#
	# For version 2 pack files, the size of a copy is limited to
	# 64K bytes or less and bit 6 of the opcode byte is set if the
	# source of the copy is from the buffer of the result object
	# instead of the the base object.
	#
	# It's unknown if any version 2 pack files were created with
	# bit 6 set in the opcode byte; however, the change that added
	# support for version 3 pack files removed the code that would
	# change the copy source to the result buffer.
	#
	#	/* -  -  -  -  -  -  -  -  -  -  -  - *\
	#	 | This reflects version 2 pack files |
	#	\* -  -  -  -  -  -  -  -  -  -  -  - */
	#
	#	byte *data = delta_hunk_start
	#	opcode = *data++
	#	off_t copy_offset= 0;
	#	size_t copy_length = 0;
	#
	#	for (shift=i=0; i<4; i++) {
	#		if (opcode & 0x01) {
	#			copy_offset |= (*data++)<<shift;
	#			}
	#		opcode >>= 1;
	#		shift += 8;
	#		}
	#
	#	for (shift=i=0; i<2; i++) {
	#		if (opcode & 0x01) {
	#			copy_length |= (*data++)<<shift;
	#			}
	#		opcode >>= 1;
	#		shift += 8;
	#		}
	#
	#	if (!copy_length) {
	#		copy_length = 1<<16;
	#		}
	#
	#	copy_from_result = opcode & 0x01
	#
	;

PACK INDEX
----------
# The name of a pack file index is "pack-${PACK_ID}.idx", where
# ${PACK_ID} is the same as that of the pack file that the pack index
# corresponds to.

<PACK_INDEX>
	:	<PACK_INDEX_CONTENTS> <PACK_INDEX_CHECKSUM>
	;

<PACK_INDEX_CONTENTS>
	:	( <INDEX_PARTIAL_COUNT> ){256}
		( <PACK_OBJECT_OFFSET> <BINARY_OBJ_ID> )*
		<PACK_FILE_CHECKSUM>
	;

<INDEX_PARTIAL_COUNT>
	# 32 bit, network byte order, binary integer of the count of
	# objects in the pack file with the first byte of the object
	# ID less than or equal to the index of the count, starting
	# from zero.
	;

<PACK_OBJECT_OFFSET>
	# 32 bit, network byte order, binary integer giving the offset,
	# in bytes from the begining of the pack file, where the
	# encoding of the object starts.
	;

<PACK_INDEX_CHECKSUM>
	:	_sha-1_digest_( <PACK_INDEX_CONTENTS> )
	;

INDEX FILE (CACHE)
------------------

<INDEX_FILE>
	:	<INDEX_FILE_FORMAT_V1>
	|	<INDEX_FILE_FORMAT_V2>
	;

<INDEX_FILE_FORMAT_V1>
	# This format is no longer supported.
	:	<INDEX_HEADER> <INDEX_CHECKSUM> <INDEX_CONTENTS>
	;

<INDEX_FILE_FORMAT_V2>
	:	<INDEX_HEADER> <EXTENDED_INDEX_CONTENTS> <EXTENED_CHECKSUM>
	;

<INDEX_HEADER>
	:	"DIRC" <INDEX_FILE_VERSION> <INDEX_ENTRY_COUNT>
	;

<INDEX_FILE_VERSION>
	# 32 bit, network byte order, binary integer indicating which
	# version of the index file format was used to create the
	# index file.
	;

<INDEX_ENTRY_COUNT>
	# 32 bit, network byte order, binary integer containg the
	# number of index entries in the index file.
	;

<EXTENDED_CHECKSUM>
	:	_sha-1_digest_( <EXTENDED_INDEX_CONTENTS> )
	;

<INDEX_CHECKSUM>
	:	_sha-1_digest_( <INDEX_CONTENTS> )
	;

<INDEX_CONTENTS>
	# Index entries are sorted by the byte sequence that comprises
	# the entry name; with a secondary comparison of the stage bits
	# from the <ENTRY_FLAGS> if the entry name byte sequences are
	# identical.
	:	( <INDEX_ENTRY> )*
	;

<EXTENDED_INDEX_CONTENTS>
	:	<INDEX_CONTENTS> <INDEX_CONTENTS_EXTENSIONS>
	;

<INDEX_ENTRY>
	:	<INDEX_ENTRY_STAT_INFO>
		<ENTRY_ID>
		<ENTRY_FLAGS>
		<ENTRY_NAME>
		<NUL>
		<ENTRY_ZERO_PADDING>
	;

<ENTRY_ZERO_PADDING>
	# The minimum length 0x00 byte sequence necessary to make the
	# written of digested byte length of the <INDEX_ENTRY> a
	# multiple of 8.
	;

<INDEX_ENTRY_STAT_INFO>
	# These fields are used as a part of a heuristic to determine
	# if the file system entity associated with this entry has
	# changed. The names are very *nix centric but the exact
	# contents of each field have no meaning to Git, besides exact
	# match, except for the <ENTRY_MODE> and <ENTRY_SIZE> fields.
	:	<ENTRY_CTIME>
		<ENTRY_MTIME>
		<ENTRY_DEV>
		<ENTRY_INODE>
		<ENTRY_MODE>
		<ENTRY_UID>
		<ENTRY_GID>
		<ENTRY_SIZE>
	;

<ENTRY_CTIME>
	# The timestamp of the last status change of the associated
	# file system entity.
	:	<ENTRY_TIME>
	;

<ENTRY_MTIME>
	# The timestamp of the last modification of the associated
	# file system entity.
	:	<ENTRY_TIME>
	;

<ENTRY_TIME>
	:	<TIME_LSB32> <TIME_NSEC>
	;

<TIME_LSB32>
	# 32 bit, network byte order, binary integer containg the lower
	# 32 bits of the entry (file or symbolic link) timestamp.
	;

<TIME_NSEC>
	# 32 bit, network byte order, binary integer containg the lower
	# 32 bits of the entry (file or symbolic link) more precise
	# timestamp, if available.
	;

<ENTRY_DEV>
	# 32 bit, network byte order, binary integer containg the lower
	# 32 bits of the entry (file or symbolic link) file system
	# device identifier. Use of this field is a compile time
	# option.
	;

<ENTRY_INODE>
	# 32 bit, network byte order, binary integer containg the lower
	# 32 bits of the entry (file or symbolic link) inode number, or
	# equivalent.
	;

<ENTRY_MODE>
	# 32 bit, network byte order, binary integer containg the lower
	# 32 bits of the entry (file or symbolic link) file system
	# entity type and permissions.
	;

<ENTRY_UID>
	# 32 bit, network byte order, binary integer containg the lower
	# 32 bits of the entry (file or symbolic link) file system
	# entity owner identifier.
	;

<ENTRY_GID>
	# 32 bit, network byte order, binary integer containg the lower
	# 32 bits of the entry (file or symbolic link) file system
	# entity group identifier, or equivalent.
	;

<ENTRY_SIZE>
	# 32 bit, network byte order, binary integer containg the lower
	# 32 bits of the entry (file or symbolic link) size.
	;

<ENTRY_ID>
	# Object ID of the of the file system entity contents.
	;

<ENTRY_FLAGS>
	# 16 bit, network byte order, binary integer.
	#	bits 15-14	Reserved
	#	bits 13-12	Entry stage
	#	bits 11-0	Name byte length
	#
	# See git-read-tree(1) for a description of how the stage
	# field is used.
	;

<ENTRY_NAME>
	# File system entity name. Path is normalized and relative to
	# the working directory.
	;

<INDEX_CONTENTS_EXTENSIONS>
	:	( <INDEX_EXTENSION> )*
	;

<INDEX_EXTENSION>
	:	<INDEX_EXTENSION_HEADER>
		<INDEX_EXTENSION_DATA>
	;

<INDEX_EXTENSION_HEADER>
	:	<INDEX_EXTENSION_NAME> <INDEX_EXTENSION_DATA_SIZE>
	;

<INDEX_EXTENSION_NAME>
	# 4 byte sequence identifying how the <INDEX_EXTENSION_DATA>
	# should be interpreted. If the first byte has a value greater
	# than or equal to the ASCII character 'A' (0x41) and less than
	# or equal to the ASCII character 'Z' (0x5a), the extension is
	# optional and does not affect the interpretation of the other
	# contents in the index file. Any non-optional extensions must
	# be understood by the reading application to correctly
	# interpret the index file contents.
	;

<INDEX_EXTENSION_DATA_SIZE>
	# 32 bit, network byte order, binary integer containg the
	# length of the <INDEX_EXTENSION_DATA> byte sequence.
	;

<INDEX_EXTENSION_DATA>
	# Sequence of bytes.
	;