I was going through this thread <http://markmail.org/message/mge2spy7uqle5ghy#query:diffcore-rename.c%20algorithm+page:1+mid:ji7jkzypzqpml2xn+state:results> trying to understand how the similarity scoring for modified files works. I see that the algorithm uses two hashes, one for each file, and proceeds to compare the 2 hashes to determine percentage similarity. I think I have an algorithm which uses only one hash and has a worst case space complexity of O(N) where N is the number of lines inserted/deleted/moved. The algorithm is as follows: def similarity(leftFileContent, rightFileContent) { int lineCounter = 0; struct HashElement { HashElement(): leftFileCount(0), rightFileCount(0) // One of these counters will remain 0 for its lifetime. The other will be >=0 int leftFileCount; // Increment if line is from leftFile int rightFileCount; // Increment if line is from right file. }; Hash<lineContent, HashElement> hash; //One hash for both files int similartyCounter = 0; while(true) { leftLline = leftFileContent[lineCounter]; rightLine = rightFileContent[lineCounter]; if (hash.contains(leftLine)) { HashElement h = hash.value(leftLine); if (h.leftFileCount > 0) {//The previous occurence has come from the left file h.leftFileCount++; //This line is another occurence of the same line in the same file. So we increment the counter. } else { h.leftFileCount--; similarityCounter++; //Hey, this line matched an earlier occurence from the other file! } if (h.leftFileCount == 0 && h.rightFileCount == 0) hash.remove(leftLine); } else { // this line is being encountered for the first time, so add it to the hash HashElement h; h.leftFileCount++; hash.add(leftLine, h); } if (hash.contains(rightLine)) { HashElement h = hash.value(rightLine); if (h.rightFileCount > 0) {//The previous occurence has come from the right file h.rightFileCount++; //This line is another occurence of the same line in the same file. So we increment the counter. } else { h.rightFileCount--; similarityCounter++; //Hey, this line matched an earlier occurence from the other file! } if (h.leftFileCount == 0 && h.rightFileCount == 0) hash.remove(line2); } else { // this line is being encountered for the first time, so add it to the hash HashElement h; h.rightFileCount++; hash.add(line2, h); } lineCounter++; if (lineCounter == leftFileContent.size() || lineCounter == rightFileContent.size()) break; } /* Upto this point we have processed an equal number of lines in both the files simultaneously. Now we may have a file which is larger in size than the other. We need to process the remanant of this file*/ bool leftFileIsLonger; FileContent fileContent; int size; if (leftFileContent.size() == rightFileContent.size()) return ((similarityCounter*100) / rightFileContent.size()); if (leftFileContent.size() >rightFileContent.size()) { leftFileIsLonger = true; fileContent = leftFileContent; //fileContent should really be a pointer to leftFileContent size = leftFileContent.size(); } else { leftFileIsLonger = false; fileContent = rightFileContent; //fileContent should really be a pointer to rightFileContent size = rightFileContent.size(); } for (i = lineCounter + 1; i < size; i++ ) { line = fileContent[i]; if (hash.contains(line)) { Hash h = hash.value(line); if (h.leftLineCount > 0) { //Line in hash came from left file if (leftFileIsLonger) h.leftFileCount++; //Even this line came from the same file, so its just a repeated line. else { h.leftFileCount--; similarityCounter++; //This line came from the right file, so increase similarityCounter and decrease the leftCounter. } } else { //Line in hash came from right file if (leftFileIsLonger) { h.rightFileCount--; similarityCounter++; } else h.rightFileCount++; } if (h.leftFileCount == 0 && h.rightFileCount == 0) hash.remove(line); } } return ((similarityCounter*100) / size); //Formula for calculating similarity is same as git uses } -- Debayan Banerjee Software Engineer -- To unsubscribe from this list: send the line "unsubscribe git" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html