Re: [PATCH 6/7] Add SVN revision parser and exporter

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi again,

Ramkumar Ramachandra wrote:

> repo_tree parses SVN revisions to build a Git objects, and use
> fast_export to emit them so they can be imported into the Git object
> store via a fast-import.

Wait, where are SVN revisions being parsed?  It seems that the repo
module maintains the exporter's state and provides a facility to
to call the fast_export module to write it out.

  repo_add(path, mode, blob_mark) is used to add a new file to
  the current commit.

  repo_modify is used to add a replacement for an existing file;
  it is implemented exactly the same way, but a check could be
  added later to distinguish the two cases.

  repo_copy copies a blob from a previous revision to the current
  commit.

  repo_replace modifies the content of a file from the current
  commit, if and only if it already exists.

  repo_delete removes a file or directory from the current commit.

  repo_commit calls out to fast_export to write the current commit
  to the fast-import stream in stdout.

  repo_diff is used by the fast_export module to write the changes
  for a commit.

  repo_reset erases the exporter's state, so valgrind can be happy.

> +void fast_export_modify(uint32_t depth, uint32_t *path, uint32_t mode,
> +                        uint32_t mark)
> +{
> +    printf("M %06o :%d ", mode, mark);
> +    pool_print_seq(depth, path, '/', stdout);
> +    putchar('\n');
> +}

Mode must be 100644, 100755, 120000, or 160000.

[...]
> +typedef struct repo_dirent_s repo_dirent_t;
> +
> +struct repo_dirent_s {
> +    uint32_t name_offset;
> +    uint32_t mode;
> +    uint32_t content_offset;
> +};
> +
> +typedef struct repo_dir_s repo_dir_t;
> +
> +struct repo_dir_s {
> +    uint32_t size;
> +    uint32_t first_offset;
> +};
> +
> +typedef struct repo_commit_s repo_commit_t;
> +
> +struct repo_commit_s {
> +    uint32_t mark;
> +    uint32_t root_dir_offset;
> +};

Style: we don’t tend to use typedef to hide underlying struct definitions
(see Documentation/CodingStyle from linux-2.6.git, chapter 5, for some
explanation about why).

> +static uint32_t num_dirs_saved = 0;
> +static uint32_t num_dirents_saved = 0;

Style: leave out the initializer for bss-allocated variables.

> +static int repo_dirent_name_cmp(const void *a, const void *b)
> +{
> +    return (((repo_dirent_t *) a)->name_offset
> +            > ((repo_dirent_t *) b)->name_offset) -
> +        (((repo_dirent_t *) a)->name_offset
> +         < ((repo_dirent_t *) b)->name_offset);
> +}

Maybe some local variables would make this more readable:

 {
	const repo_dirent_t *dirent1 = a, *dirent2 = b;
	uint32_t a_offset = dirent1->name_offset;
	uint32_t b_offset = dirent2->name_offset;

	return (a_offset > b_offset) - (a_offset < b_offset);
 }

> +static repo_dir_t *repo_clone_dir(repo_dir_t *orig_dir, uint32_t padding)
> +{
> +    uint32_t orig_o, new_o, dirent_o;
> +    orig_o = dir_offset(orig_dir);
> +    if (orig_o < num_dirs_saved) {
> +        new_o = dir_with_dirents_alloc(orig_dir->size + padding);
> +        orig_dir = dir_pointer(orig_o);
> +        dirent_o = dir_pointer(new_o)->first_offset;
> +    } else {
> +        if (padding == 0)
> +            return orig_dir;
> +        new_o = orig_o;
> +        dirent_o = dirent_alloc(orig_dir->size + padding);
> +    }
> +    memcpy(dirent_pointer(dirent_o), repo_first_dirent(orig_dir),
> +           orig_dir->size * sizeof(repo_dirent_t));
> +    dir_pointer(new_o)->size = orig_dir->size + padding;
> +    dir_pointer(new_o)->first_offset = dirent_o;
> +    return dir_pointer(new_o);
> +}

Is this for adding new entries to an existing directory?  It is
getting late, so I did not look it over carefully.

> +static char repo_path_buffer[REPO_MAX_PATH_LEN];
> +static repo_dirent_t *repo_read_dirent(uint32_t revision, char *path)
> +{
> +    char *ctx = NULL;
> +    uint32_t name = 0;
> +    repo_dir_t *dir = NULL;
> +    repo_dirent_t *dirent = NULL;
> +    dir = repo_commit_root_dir(commit_pointer(revision));
> +    strncpy(repo_path_buffer, path, REPO_MAX_PATH_LEN);
> +    repo_path_buffer[REPO_MAX_PATH_LEN - 1] = '\0';
> +    path = repo_path_buffer;
> +    for (name = pool_tok_r(path, "/", &ctx);
> +         ~name; name = pool_tok_r(NULL, "/", &ctx)) {
> +        dirent = repo_dirent_by_name(dir, name);
> +        if (dirent == NULL) {
> +            return NULL;
> +        } else if (repo_dirent_is_dir(dirent)) {
> +            dir = repo_dir_from_dirent(dirent);
> +        } else {
> +            break;
> +        }
> +    }
> +    return dirent;
> +}

If a file "foo" exists and I ask for "foo/bar", this will return
the entry for foo.  Is that appropriate?

> +static void
> +repo_write_dirent(char *path, uint32_t mode, uint32_t content_offset,
> +                  uint32_t del)
> +{
> +    char *ctx;
> +    uint32_t name, revision, dirent_o, dir_o, parent_dir_o;
> +    repo_dir_t *dir;
> +    repo_dirent_t *dirent = NULL;
> +    revision = active_commit;
> +    dir = repo_commit_root_dir(commit_pointer(revision));
> +    dir = repo_clone_dir(dir, 0);
> +    commit_pointer(revision)->root_dir_offset = dir_offset(dir);
> +    strncpy(repo_path_buffer, path, REPO_MAX_PATH_LEN);
> +    repo_path_buffer[REPO_MAX_PATH_LEN - 1] = '\0';
> +    path = repo_path_buffer;
> +    for (name = pool_tok_r(path, "/", &ctx); ~name;
> +         name = pool_tok_r(NULL, "/", &ctx)) {
> +        parent_dir_o = dir_offset(dir);
> +        dirent = repo_dirent_by_name(dir, name);
> +        if (dirent == NULL) {

static repo_dir_t *add_subdir(uint32_t parent_dir_o, repo_dir_t *dir,
                              uint32_t name)
{

> +            dir = repo_clone_dir(dir, 1);
> +            dirent = &repo_first_dirent(dir)[dir->size - 1];
> +            dirent->name_offset = name;
> +            dirent->mode = REPO_MODE_DIR;
> +            qsort(repo_first_dirent(dir), dir->size,
> +                  sizeof(repo_dirent_t), repo_dirent_name_cmp);
> +            dirent = repo_dirent_by_name(dir, name);
> +            dir_o = dir_with_dirents_alloc(0);
> +            dirent->content_offset = dir_o;
> +            dir = dir_pointer(dir_o);

}

> +        } else if ((dir = repo_dir_from_dirent(dirent))) {
> +            dirent_o = dirent_offset(dirent);
> +            dir = repo_clone_dir(dir, 0);
> +            if (dirent_o != ~0)
> +                dirent_pointer(dirent_o)->content_offset = dir_offset(dir);
> +        } else {

static repo_dir_t *replace_with_empty_dir(uint32_t dirent)
{

> +            dirent->mode = REPO_MODE_DIR;
> +            dirent_o = dirent_offset(dirent);
> +            dir_o = dir_with_dirents_alloc(0);
> +            dirent = dirent_pointer(dirent_o);
> +            dir = dir_pointer(dir_o);
> +            dirent->content_offset = dir_o;

}

> +        }
> +    }
> +    if (dirent) {

When would it be NULL?  Is an empty path allowed?

static void fill_dirent(repo_dirent_t *dirent,
                        uint32_t mode, uint32_t content_offset,
                        uint32_t parent_dir_o, int del)
{

> +        dirent->mode = mode;
> +        dirent->content_offset = content_offset;
> +        if (del && ~parent_dir_o) {
> +            dirent->name_offset = ~0;
> +            dir = dir_pointer(parent_dir_o);
> +            qsort(repo_first_dirent(dir), dir->size,
> +                  sizeof(repo_dirent_t), repo_dirent_name_cmp);
> +            dir->size--;
> +        }

}

> +    }
> +}
[...]

> +static void
> +repo_git_add_r(uint32_t depth, uint32_t *path, repo_dir_t *dir);
> +
> +static void
> +repo_git_add(uint32_t depth, uint32_t *path, repo_dirent_t *dirent)
> +{
> +    if (repo_dirent_is_dir(dirent)) {
> +        repo_git_add_r(depth, path, repo_dir_from_dirent(dirent));
> +    } else {
> +        fast_export_modify(depth, path, dirent->mode, dirent->content_offset);
> +    }
> +}

Just curious: does gcc notice the tail calls?

> +static void
> +repo_git_add_r(uint32_t depth, uint32_t *path, repo_dir_t *dir)
> +{
> +    uint32_t o;
> +    repo_dirent_t *de;
> +    de = repo_first_dirent(dir);
> +    for (o = 0; o < dir->size; o++) {
> +        path[depth] = de[o].name_offset;
> +        repo_git_add(depth + 1, path, &de[o]);
> +    }
> +}

Can this overflow the path stack?

> +
> +static void
> +repo_diff_r(uint32_t depth, uint32_t *path, repo_dir_t *dir1,
> +            repo_dir_t *dir2)
> +{
> +    repo_dirent_t *de1, *de2, *max_de1, *max_de2;
> +    de1 = repo_first_dirent(dir1);
> +    de2 = repo_first_dirent(dir2);
> +    max_de1 = &de1[dir1->size];
> +    max_de2 = &de2[dir2->size];
> +
> +    while (de1 < max_de1 && de2 < max_de2) {
> +        if (de1->name_offset < de2->name_offset) {
> +            path[depth] = (de1++)->name_offset;
> +            fast_export_delete(depth + 1, path);
> +        } else if (de1->name_offset == de2->name_offset) {
> +            path[depth] = de1->name_offset;
> +            if (de1->content_offset != de2->content_offset) {
> +                if (repo_dirent_is_dir(de1) && repo_dirent_is_dir(de2)) {
> +                    repo_diff_r(depth + 1, path,
> +                                repo_dir_from_dirent(de1),
> +                                repo_dir_from_dirent(de2));
> +                } else {
> +                    if (repo_dirent_is_dir(de1) != repo_dirent_is_dir(de2)) {
> +                        fast_export_delete(depth + 1, path);
> +                    }
> +                    repo_git_add(depth + 1, path, de2);
> +                }
> +            }
> +            de1++;
> +            de2++;
> +        } else {
> +            path[depth] = de2->name_offset;
> +            repo_git_add(depth + 1, path, de2++);
> +        }

Might be easier to read the other way around:

		if (de1->name_offset < de2->name_offset) {
			path[depth] = (de1++)->name_offset;
			fast_export_delete(depth + 1, path);
			continue;
		}
		if (de1->name_offset > de2->name_offset) {
			path[depth] = de2->name_offset;
			repo_git_add(depth + 1, path, de2++);
			continue;
		}
		... matching paths case ...

> +    }
> +    while (de1 < max_de1) {
> +        path[depth] = (de1++)->name_offset;
> +        fast_export_delete(depth + 1, path);
> +    }
> +    while (de2 < max_de2) {
> +        path[depth] = de2->name_offset;
> +        repo_git_add(depth + 1, path, de2++);
> +    }
> +}
[...]

> +void repo_commit(uint32_t revision, char *author, char *log, char *uuid,
> +                 char *url, time_t timestamp)
> +{
> +    if (revision == 0) {
> +        active_commit = commit_alloc(1);
> +        commit_pointer(active_commit)->root_dir_offset =
> +            dir_with_dirents_alloc(0);
> +    } else {
> +        fast_export_commit(revision, author, log, uuid, url, timestamp);
> +    }
> +    num_dirs_saved = dir_pool.size;
> +    num_dirents_saved = dirent_pool.size;

I did not carefully trace the cases where repo_clone_dir may reuse a
dir.  I would be happier if someone else does.

The rest looked good to me, for what it’s worth.

Thanks,
Jonathan
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Kernel Development]     [Gcc Help]     [IETF Annouce]     [DCCP]     [Netdev]     [Networking]     [Security]     [V4L]     [Bugtraq]     [Yosemite]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Linux SCSI]     [Fedora Users]