Re: [RFH] Exploration of an alternative diff_delta() algorithm

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sun, Apr 09, 2006 at 01:45:00PM -0400, Nicolas Pitre wrote:
...
> Try this with the README file from the git source tree:
> 
> 	sed s/git/GIT/g < ./README > /tmp/README.mod
> 	test-delta -d ./README /tmp/README.mod /tmp/README.delta
> 	[BOOM!]

I found the bug.  The code still has some limitations, but now
it passes the test suite.  Thanks for your help, Nicolas.

Peter

----->8---diff-delta.c---->8-------
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include "delta.h"


#define BASE 257
#define PREFIX_SIZE 3

#define SIZE 10
#define HASH_TABLE_SIZE (1<<SIZE)

#define DELTA_SIZE (1024 * 1024)


unsigned int init_hash(unsigned char* data) {
  return data[0]*BASE*BASE + data[1]*BASE + data[2];
}

unsigned int hash(unsigned char* data, unsigned int hash) {
  return (hash - data[-1]*BASE*BASE)*BASE + data[2];
}

#define GR_PRIME 0x9e370001
#define HASH(v) ((v * GR_PRIME) >> (32 - SIZE))

struct entry {
  char file;
  char* offset;
};


void flush(struct entry* table) {
  memset(table, 0, HASH_TABLE_SIZE * sizeof(struct entry));
}


int same_prefixes(char* data1, char* data2) {
  return !memcmp(data1, data2, PREFIX_SIZE);  
}


void encode_add(char* out, int* outpos, char* version_start, char* version_copy) {
  unsigned int size = version_copy - version_start;
  if (!size) return;
  int pos = *outpos;

  while(size > 127) {
    out[pos++] = 127;
    memcpy(out + pos, version_start, 127);
    pos += 127;
    version_start += 127;
    size -= 127;
  }
  out[pos++] = size;
  memcpy(out + pos, version_start, size);  
  pos += size;

  *outpos = pos;
}


void encode_copy(char* out, int* outpos, int offset, int size) {
     int pos = (*outpos) + 1;
     int i = 0x80;

     if (offset & 0xff) { out[pos++] = offset; i |= 0x01; }
     offset >>= 8;
     if (offset & 0xff) { out[pos++] = offset; i |= 0x02; }
     offset >>= 8;
     if (offset & 0xff) { out[pos++] = offset; i |= 0x04; }
     offset >>= 8;
     if (offset & 0xff) { out[pos++] = offset; i |= 0x08; }

     if (size & 0xff) { out[pos++] = size; i |= 0x10; }
     size >>= 8;
     if (size & 0xff) { out[pos++] = size; i |= 0x20; }

     out[*outpos] = i;
     *outpos = pos;
}



void encode_size(char* out, int* outpos, unsigned long size) {
  int pos = *outpos;
  out[pos] = size;
  size >>= 7;
  while (size) {
    out[pos++] |= 0x80;
    out[pos] = size;
    size >>= 7;
  }
  *outpos = ++pos;
}




void *diff_delta(void *from_buf, unsigned long from_size,
		 void *to_buf, unsigned long to_size,
		 unsigned long *delta_size,
		 unsigned long max_size) {
  unsigned int index;
  unsigned int l;
  unsigned char* base = from_buf;
  unsigned char* version = to_buf;
  unsigned long base_size = from_size;
  unsigned long version_size = to_size;

  unsigned char* base_copy = base;
  unsigned char* version_copy = version;
  struct entry* table = calloc(HASH_TABLE_SIZE, sizeof(struct entry));
  //int delta_alloc = DELTA_SIZE;
  unsigned char* delta = malloc(DELTA_SIZE);
  unsigned int deltapos = 0;
  unsigned char* base_top = base + base_size;
  unsigned char* version_top = version + version_size;

  encode_size(delta, &deltapos, base_size);
  encode_size(delta, &deltapos, version_size);

  unsigned char* base_offset = base;
  unsigned char* version_offset = version;
  unsigned int base_hash = init_hash(base);
  unsigned int version_hash = init_hash(version);
  unsigned char* version_start = version;

  while(base_offset - base + PREFIX_SIZE < base_top - base && 
	version_offset - version + PREFIX_SIZE < version_top - version) {  
    // step2:

    index = HASH(base_hash);
    switch (table[index].file) {
    case '\0': {
      table[index].file = 'b';
      table[index].offset = base_offset;
      break;
    }
    case 'v': {
      if (same_prefixes(base_offset, table[index].offset)) {
	base_copy = base_offset;
	version_copy = table[index].offset;
	goto step3;
      } else break;
    }
    case 'b': break;
    default: printf("AAAAAARGH 2b\n");
    }
    
    index = HASH(version_hash);
    switch (table[index].file) {
    case '\0': {
      table[index].file = 'v';
      table[index].offset = version_offset;
      break;
    }
    case 'b': {
      if (same_prefixes(table[index].offset, version_offset)) {
	base_copy = table[index].offset;
	version_copy = version_offset;
	goto step3;
      } else break;
    }
    case 'v': break;
    default: printf("AAAAAARGH 2v\n");
    }
    
    base_offset++;
    version_offset++;

    base_hash = hash(base_offset, base_hash);
    version_hash = hash(version_offset, version_hash);
    continue;  //  goto step2;
    
  step3:
    l = 0;
    while(base_copy[l] == version_copy[l] && base_copy + l < base_top && version_copy + l < version_top) l++;
    base_offset = base_copy + l;
    version_offset = version_copy + l;
    
    /*
    // Make sure we don't run out of delta buffer when encoding.
    if((delta_alloc - deltapos) < 
       (version_start - version_copy) + 1 + 8 + (PREFIX_SIZE + 1)) {
      delta_alloc = delta_alloc * 3 / 2;
      delta = (char*) realloc(delta, delta_alloc);
    }
    */
	if(max_size && deltapos > max_size) {
		free(delta);
		free(table);
		return NULL;
	}

	//fprintf(stdout, "add: pos %u, v_start %u, v_copy %u\n", 
	//	deltapos, version_start - version, version_copy - version);	


    // step4:
    encode_add(delta, &deltapos, version_start, version_copy);

	//fprintf(stdout, "copy: pos %u, v_copy %u, l %u\n", 
	//	deltapos, base_copy - base, l);	


    encode_copy(delta, &deltapos, base_copy - base, l);
    
    // step5:
    flush(table);
    
    version_start = version_offset;
    
    base_hash = init_hash(base_offset);
    version_hash = init_hash(version_offset);
    
	//fprintf(stdout, "3) pos %u, v_start %u, v %u, b %u\n", 
	//	deltapos, version_start - version, version_offset - version, base_offset- base);	
  }  //  goto step2;
  
	//fprintf(stdout, "pos %u, v_start %u, v_top %u\n", 
	//	deltapos, version_start - version, version_size);	
  encode_add(delta, &deltapos, version_start, version + version_size);
  *delta_size = deltapos;
  free(table);
  return delta;
}

-
: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html

[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]