Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx> writes: > Why? That directory has a binary file, which doesn't have a "line count", > so --dirstat actually ends up using the byte-count instead there, which > inflates the macosx numbers a lot (same is true of git-gui/lib/, just to a > smaller degree). > > I don't think that's necessarily a bug, but it's certainly misleading. > Does it matter? Not to me, because the kernel generally doesn't have those > kinds of issues (no binary blobs that ever really change), but I did want > to point out that there's room for perhaps improving things. Maybe we > could add a byte count in general to the diffstat data structures and > always just count bytes changed rather than lines changed. Or maybe we > should consider binary files to have something like "32 bytes per virtual > line" or something. You can drop the "line count" idea and instead count the "amount of damage" done to the preimage, just like we do in the rename similarity computation. The attached patch is just an outline. There may need special cases for unmerged paths. Comparison with your examples is a bit interesting: (non cumulative) (linus, non cumulative) 18.6% crypto/ 20.5% crypto/ 7.7% fs/afs/ 7.6% fs/afs/ 10.9% fs/fuse/ 7.6% fs/fuse/ 7.4% fs/gfs2/ 7.6% fs/gfs2/ 5.3% fs/jffs2/ 5.1% fs/jffs2/ 4.9% fs/nfs/ 5.1% fs/nfs/ 4.5% fs/nfsd/ 5.1% fs/nfsd/ 7.4% fs/reiserfs/ 7.6% fs/reiserfs/ 14.4% fs/ 15.3% fs/ 7.4% net/rxrpc/ 7.6% net/rxrpc/ 10.8% security/keys/ 10.2% security/keys/ So it does not change much for text-only project. But the big binary file difference is now in the noise: (git) (linus, git) 12.4% Documentation/ 10.0% Documentation/ 7.3% contrib/ 4.9% contrib/ 6.0% git-gui/lib/ 5.7% git-gui/lib/ 12.9% git-gui/po/ 13.9% git-gui/macosx/ 23.0% git-gui/ 8.7% git-gui/po/ 7.3% gitk-git/ 31.4% git-gui/ 14.1% t/ 4.6% gitk-git/ 13.1% t/ git-gui/lib is about 220k while macosx is 30k, when counted with "find $there -type f | xargs cat | wc -c", so this reflects the reality much better. I also suspect --cumulative should imply --dirstat but that is a different patch if ever. diff.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++--------------- 1 files changed, 61 insertions(+), 19 deletions(-) diff --git a/diff.c b/diff.c index dd374d4..8ad04ac 100644 --- a/diff.c +++ b/diff.c @@ -990,18 +990,23 @@ static void show_numstat(struct diffstat_t* data, struct diff_options *options) } } -struct diffstat_dir { - struct diffstat_file **files; - int nr, percent, cumulative; +struct dirstat_file { + const char *name; + unsigned long changed; }; -static long gather_dirstat(struct diffstat_dir *dir, unsigned long changed, const char *base, int baselen) +struct dirstat_dir { + struct dirstat_file *files; + int alloc, nr, percent, cumulative; +}; + +static long gather_dirstat(struct dirstat_dir *dir, unsigned long changed, const char *base, int baselen) { unsigned long this_dir = 0; unsigned int sources = 0; while (dir->nr) { - struct diffstat_file *f = *dir->files; + struct dirstat_file *f = dir->files; int namelen = strlen(f->name); unsigned long this; char *slash; @@ -1016,7 +1021,7 @@ static long gather_dirstat(struct diffstat_dir *dir, unsigned long changed, cons this = gather_dirstat(dir, changed, f->name, newbaselen); sources++; } else { - this = f->added + f->deleted; + this = f->changed; dir->files++; dir->nr--; sources += 2; @@ -1044,17 +1049,58 @@ static long gather_dirstat(struct diffstat_dir *dir, unsigned long changed, cons return this_dir; } -static void show_dirstat(struct diffstat_t *data, struct diff_options *options) +static void show_dirstat(struct diff_options *options) { int i; unsigned long changed; - struct diffstat_dir dir; + struct dirstat_dir dir; + struct diff_queue_struct *q = &diff_queued_diff; + + dir.files = NULL; + dir.alloc = 0; + dir.nr = 0; + dir.percent = options->dirstat_percent; + dir.cumulative = options->output_format & DIFF_FORMAT_CUMULATIVE; - /* Calculate total changes */ changed = 0; - for (i = 0; i < data->nr; i++) { - changed += data->files[i]->added; - changed += data->files[i]->deleted; + for (i = 0; i < q->nr; i++) { + struct diff_filepair *p = q->queue[i]; + const char *name; + unsigned long copied, added, damage; + + name = p->one->path ? p->one->path : p->two->path; + + if (DIFF_FILE_VALID(p->one) && DIFF_FILE_VALID(p->two)) { + diff_populate_filespec(p->one, 0); + diff_populate_filespec(p->two, 0); + diffcore_count_changes(p->one, p->two, NULL, NULL, 0, + &copied, &added); + diff_free_filespec_data(p->one); + diff_free_filespec_data(p->two); + } else if (DIFF_FILE_VALID(p->one)) { + diff_populate_filespec(p->one, 1); + copied = added = 0; + diff_free_filespec_data(p->one); + } else if (DIFF_FILE_VALID(p->two)) { + diff_populate_filespec(p->two, 1); + copied = 0; + added = p->two->size; + diff_free_filespec_data(p->two); + } else + continue; + + /* + * Original minus copied is the removed material, + * added is the new material. They are both damages + * made to the preimage. + */ + damage = (p->one->size - copied) + added; + + ALLOC_GROW(dir.files, dir.nr + 1, dir.alloc); + dir.files[dir.nr].name = name; + dir.files[dir.nr].changed = damage; + changed += damage; + dir.nr++; } /* This can happen even with many files, if everything was renames */ @@ -1062,10 +1108,6 @@ static void show_dirstat(struct diffstat_t *data, struct diff_options *options) return; /* Show all directories with more than x% of the changes */ - dir.files = data->files; - dir.nr = data->nr; - dir.percent = options->dirstat_percent; - dir.cumulative = options->output_format & DIFF_FORMAT_CUMULATIVE; gather_dirstat(&dir, changed, "", 0); } @@ -3024,7 +3066,7 @@ void diff_flush(struct diff_options *options) separator++; } - if (output_format & (DIFF_FORMAT_DIFFSTAT|DIFF_FORMAT_SHORTSTAT|DIFF_FORMAT_NUMSTAT|DIFF_FORMAT_DIRSTAT)) { + if (output_format & (DIFF_FORMAT_DIFFSTAT|DIFF_FORMAT_SHORTSTAT|DIFF_FORMAT_NUMSTAT)) { struct diffstat_t diffstat; memset(&diffstat, 0, sizeof(struct diffstat_t)); @@ -3034,8 +3076,6 @@ void diff_flush(struct diff_options *options) if (check_pair_status(p)) diff_flush_stat(p, options, &diffstat); } - if (output_format & DIFF_FORMAT_DIRSTAT) - show_dirstat(&diffstat, options); if (output_format & DIFF_FORMAT_NUMSTAT) show_numstat(&diffstat, options); if (output_format & DIFF_FORMAT_DIFFSTAT) @@ -3045,6 +3085,8 @@ void diff_flush(struct diff_options *options) free_diffstat_info(&diffstat); separator++; } + if (output_format & DIFF_FORMAT_DIRSTAT) + show_dirstat(options); if (output_format & DIFF_FORMAT_SUMMARY && !is_summary_empty(q)) { for (i = 0; i < q->nr; i++) - To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html