Hi- I'm submitting a patch that provides an optimization to the isomd5sum media check functionality. Presently, the algorithm calculates a single md5sum after reading the entire disc, and reports a "PASS" or "FAIL" at that time. When there is corrupted data on the disc, a user will not know until the entire disc has been read--which can take quite a while. It would be optimal for the algorithm to exit the checking and report a failure as soon after reading the corrupted section as possible. I've accomplished this by adding two extra fields to the application data header of the ISO image. The ISO 9660 spec allows for a ~512-character string, of which were 224 characters are used. I've added two fields, "FRAGMENT SUMS" and "FRAGMENT COUNT", after which 326 characters are used. The "FRAGMENT COUNT" is an integer representing the number of fragments in which to break the entire ISO. The "FRAGMENT SUMS" is an ascii string of concatenated leading characters of md5sums of subsequent fragments. I've #define'd the length of FRAGMENT SUMS to 60 characters, as this number is evenly divisible by (2, 3, 4, 5, 6, 10, 12, 15, 20, 30). I've also #define'd FRAGMENT COUNT to be 20, which means that ISO will be broken into 20 fragments, each fragment represented by a 3-character string (the first 3 characters of the fragment's md5sum) in FRAGMENT SUMS. Looking at an example, the application data header used to look like: ISO MD5SUM = 1fca2222a947399d5b4a4226629be254;SKIPSECTORS = 15;RHLISOSTATUS=1;THIS IS NOT THE SAME AS RUNNING MD5SUM ON THIS ISO!! It now looks like: ISO MD5SUM = 1fca2222a947399d5b4a4226629be254;SKIPSECTORS = 15;RHLISOSTATUS=0;FRAGMENT SUMS = 67bd3263cccfda2d851a493b9a238e8877c45bca78e276e94437dd9f26e4;FRAGMENT COUNT= 20;THIS IS NOT THE SAME AS RUNNING MD5SUM ON THIS ISO!! The string [67bd3263cccfda2d851a493b9a238e8877c45bca78e276e94437dd9f26e4] is actually a concatenation of: md5(frag(0))[0-2] = 67b md5(frag(1))[0-2] = d32 md5(frag(2))[0-2] = 63c md5(frag(3))[0-2] = ccf md5(frag(4))[0-2] = da2 md5(frag(5))[0-2] = d85 md5(frag(6))[0-2] = 1a4 md5(frag(7))[0-2] = 93b md5(frag(8))[0-2] = 9a2 md5(frag(9))[0-2] = 38e md5(frag(10))[0-2] = 887 md5(frag(11))[0-2] = 7c4 md5(frag(12))[0-2] = 5bc md5(frag(13))[0-2] = a78 md5(frag(14))[0-2] = e27 md5(frag(15))[0-2] = 6e9 md5(frag(16))[0-2] = 443 md5(frag(17))[0-2] = 7dd md5(frag(18))[0-2] = 9f2 md5(frag(19))[0-2] = 6e4 Thus, if any one of these prescribed strings does not match what is calculated, the checking exits immediately, as a corruption has been detected. In the cases where the error is near the beginning of the disc, the time to identify the problem is greatly reduced. Note that there is a chance of collision. When the disc is broken into 20 fragments, each fragment is represented by a 3-character string, yielding a collision 1 in 16^3=4096 cases. Such a collision only means that the user will not benefit from the optimization, as the entire disc will be checked by the last operation just like it has always been done and the error will be detected at that time. One can reduce the chances of collision by decreasing the number of fragments, or expanding the length of the FRAGMENT SUMS string. This optimization occurs at virtually no performance cost. Media checking is overwhelmingly I/O bound, reading from disc. The additional calculations of smaller sums adds little additional processing time, as most of that time was being spent waiting on new data from disc. Please consider for inclusion in Anaconda, and I'll be glad to answer any questions or address concerns. I think the patch is pretty simple, and isolated to two files. Thanks! Dustin Kirkland
diff -Naur anaconda-10.1.0.2/isomd5sum/libcheckisomd5.c anaconda-10.1.0.2.dustin/isomd5sum/libcheckisomd5.c --- anaconda-10.1.0.2/isomd5sum/libcheckisomd5.c 2005-03-25 13:42:57.325502288 -0500 +++ anaconda-10.1.0.2.dustin/isomd5sum/libcheckisomd5.c 2005-03-25 14:16:33.118055240 -0500 @@ -15,16 +15,20 @@ #define APPDATA_OFFSET 883 #define SIZE_OFFSET 84 +/* Length in characters of string used for fragment md5sum checking */ +#define FRAGMENT_SUM_LENGTH 60 + #define MAX(x, y) ((x > y) ? x : y) #define MIN(x, y) ((x < y) ? x : y) /* finds primary volume descriptor and returns info from it */ /* mediasum must be a preallocated buffer at least 33 bytes long */ -static int parsepvd(int isofd, char *mediasum, int *skipsectors, long long *isosize, int *supported) { +/* fragmentsums must be a preallocated buffer at least FRAGMENT_SUM_LENGTH+1 bytes long */ +static int parsepvd(int isofd, char *mediasum, int *skipsectors, long long *isosize, int *supported, char *fragmentsums, long long *fragmentcount) { unsigned char buf[2048]; unsigned char buf2[512]; unsigned char tmpbuf[512]; - int skipfnd, md5fnd, supportedfnd; + int skipfnd, md5fnd, supportedfnd, fragsumfnd, fragcntfnd; unsigned int loc; long long offset; unsigned char *p; @@ -54,6 +58,8 @@ md5fnd = 0; skipfnd = 0; + fragsumfnd = 0; + fragcntfnd = 0; supportedfnd = 0; loc = 0; while (loc < 512) { @@ -92,14 +98,46 @@ } else if (!strncmp(buf2 + loc, "RHLISOSTATUS=1", 14)) { *supported = 1; supportedfnd = 1; + for (p=buf2+loc; *p != ';' && loc < 512; p++, loc++); } else if (!strncmp(buf2 + loc, "RHLISOSTATUS=0", 14)) { *supported = 0; supportedfnd = 1; - } else { + for (p=buf2+loc; *p != ';' && loc < 512; p++, loc++); + } else if (!strncmp(buf2 + loc, "FRAGMENT SUMS = ", 16)) { + /* make sure we dont walk off end */ + if ((loc + FRAGMENT_SUM_LENGTH) > 511) + return -1; + + memcpy(fragmentsums, buf2 + loc + 16, FRAGMENT_SUM_LENGTH); + fragmentsums[FRAGMENT_SUM_LENGTH] = '\0'; + fragsumfnd = 1; + loc += FRAGMENT_SUM_LENGTH + 16; + for (p=buf2+loc; *p != ';' && loc < 512; p++, loc++); + } else if (!strncmp(buf2 + loc, "FRAGMENT COUNT = ", 17)) { + char *errptr; + /* make sure we dont walk off end */ + if ((loc + 17) > 511) + return -1; + + loc = loc + 17; + for (p=tmpbuf; buf2[loc] != ';' && loc < 512; p++, loc++) + *p = buf2[loc]; + + *p = '\0'; + + *fragmentcount = strtol(tmpbuf, &errptr, 10); + if (errptr && *errptr) { + return -1; + } else { + fragcntfnd = 1; + } + + for (p=buf2+loc; *p != ';' && loc < 512; p++, loc++); + } else { loc++; } - if ((skipfnd & md5fnd) & supportedfnd) + if ((skipfnd & md5fnd & fragsumfnd & fragcntfnd) & supportedfnd) break; } @@ -119,19 +157,26 @@ /* both strings must be pre-allocated at least 33 chars in length */ static int checkmd5sum(int isofd, char *mediasum, char *computedsum, int quiet) { int nread; - int i; + int i, j; int appdata_start_offset, appdata_end_offset; int nattempt; int skipsectors; int supported; + int current_fragment = 0; + int previous_fragment = 0; unsigned int bufsize = 32768; unsigned char md5sum[16]; + unsigned char fragmd5sum[16]; unsigned int len; unsigned char *buf; long long isosize, offset, pvd_offset, apoff; + char fragmentsums[FRAGMENT_SUM_LENGTH]; + char thisfragsum[FRAGMENT_SUM_LENGTH]; + long long fragmentcount = 0; MD5_CTX md5ctx; + MD5_CTX fragmd5ctx; - if ((pvd_offset = parsepvd(isofd, mediasum, &skipsectors, &isosize, &supported)) < 0) + if ((pvd_offset = parsepvd(isofd, mediasum, &skipsectors, &isosize, &supported, fragmentsums, &fragmentcount)) < 0) return -1; /* printf("Mediasum = %s\n",mediasum); */ @@ -140,6 +185,9 @@ lseek(isofd, 0L, SEEK_SET); MD5_Init(&md5ctx); + if (fragmentcount) + MD5_Init(&fragmd5ctx); + offset = 0; apoff = pvd_offset + APPDATA_OFFSET; @@ -178,7 +226,33 @@ memset(buf+appdata_start_offset, ' ', len); } + /* update overall md5ctx */ MD5_Update(&md5ctx, buf, nread); + if (fragmentcount) { + /* update the fragment md5ctx */ + current_fragment = offset * (fragmentcount+1) / (isosize - skipsectors*2048); + MD5_Update(&fragmd5ctx, buf, nread); + /* if we're onto the next fragment, calculate the previous sum and check */ + if ( current_fragment != previous_fragment ) { + MD5_Final(fragmd5sum, &fragmd5ctx); + *computedsum = '\0'; + j = (current_fragment-1)*FRAGMENT_SUM_LENGTH/fragmentcount; + for (i=0; i<FRAGMENT_SUM_LENGTH/fragmentcount; i++) { + char tmpstr[2]; + snprintf(tmpstr, 2, "%01x", fragmd5sum[i]); + strncat(computedsum, tmpstr, 2); + thisfragsum[i] = fragmentsums[j++]; + } + thisfragsum[j] = '\0'; + /* printf("\nFragment [%i]: %s ?= %s\n", previous_fragment, computedsum, thisfragsum); */ + previous_fragment = current_fragment; + /* Exit immediatiately if current fragment sum is incorrect */ + if (strcmp(thisfragsum, computedsum) != 0) { + free(buf); + return 0; + } + } + } offset = offset + nread; if (!quiet) { @@ -201,7 +275,7 @@ for (i=0; i<16; i++) { char tmpstr[4]; snprintf (tmpstr, 4, "%02x", md5sum[i]); - strcat(computedsum, tmpstr); + strncat(computedsum, tmpstr, 2); } /* printf("mediasum, computedsum = %s %s\n", mediasum, computedsum); */ @@ -210,7 +284,7 @@ return 0; else return 1; - } +} #if 0 @@ -233,8 +307,10 @@ static int doMediaCheck(int isofd, char *mediasum, char *computedsum, long long *isosize, int *supported, int quiet) { int rc; int skipsectors; + long long fragmentcount = 0; + char fragmentsums[FRAGMENT_SUM_LENGTH+1]; - if (parsepvd(isofd, mediasum, &skipsectors, isosize, supported) < 0) { + if (parsepvd(isofd, mediasum, &skipsectors, isosize, supported, fragmentsums, &fragmentcount) < 0) { fprintf(stderr, "Unable to read the disc checksum from the " "primary volume descriptor.\nThis probably " "means the disc was created without adding the " @@ -290,6 +366,8 @@ int isofd; char mediasum[64]; long long isosize; + char fragmentsums[FRAGMENT_SUM_LENGTH+1]; + long long fragmentcount = 0; int supported; int skipsectors; @@ -300,7 +378,7 @@ exit(1); } - if (parsepvd(isofd, mediasum, &skipsectors, &isosize, &supported) < 0) { + if (parsepvd(isofd, mediasum, &skipsectors, &isosize, &supported, fragmentsums, &fragmentcount) < 0) { fprintf(stderr, "%s: Could not get pvd data", file); fprintf(stderr, "\nUnable to read the disc checksum from the " "primary volume descriptor.\nThis probably " @@ -312,4 +390,8 @@ close(isofd); printf("%s: %s\n", file, mediasum); + if ( (strlen(fragmentsums) > 0) && (fragmentcount > 0) ) { + printf("Fragment sums: %s\n", fragmentsums); + printf("Fragment count: %lld\n", fragmentcount); + } } diff -Naur anaconda-10.1.0.2/isomd5sum/libimplantisomd5.c anaconda-10.1.0.2.dustin/isomd5sum/libimplantisomd5.c --- anaconda-10.1.0.2/isomd5sum/libimplantisomd5.c 2005-03-25 09:34:34.000000000 -0500 +++ anaconda-10.1.0.2.dustin/isomd5sum/libimplantisomd5.c 2005-03-25 11:45:43.000000000 -0500 @@ -15,6 +15,12 @@ #define APPDATA_OFFSET 883 #define SIZE_OFFSET 84 +/* Length in characters of string used for fragment md5sum checking */ +#define FRAGMENT_SUM_LENGTH 60 +/* FRAGMENT_COUNT must be an integral divisor or FRAGMENT_SUM_LENGTH */ +/* 60 => 2, 3, 4, 5, 6, 10, 12, 15, 20, or 30 */ +#define FRAGMENT_COUNT 20 + /* number of sectors to ignore at end of iso when computing sum */ #define SKIPSECTORS 15 @@ -87,15 +93,22 @@ int nread; int dirty; int pvd_offset; + int current_fragment = 0; + int previous_fragment = 0; + int nattempt; long long isosize, total; unsigned char md5sum[16]; + unsigned char fragmd5sum[16]; unsigned int loc; - unsigned char buf[2048]; + unsigned int bufsize = 32768; + unsigned char *buf; unsigned char orig_appdata[512]; unsigned char new_appdata[512]; unsigned char mediasum[33]; char md5str[40]; + char fragstr[FRAGMENT_SUM_LENGTH+1]; MD5_CTX md5ctx; + MD5_CTX fragmd5ctx; isofd = open(fname, O_RDWR); @@ -138,18 +151,39 @@ lseek(isofd, 0L, SEEK_SET); MD5_Init(&md5ctx); + MD5_Init(&fragmd5ctx); + *fragstr = '\0'; + buf = malloc(bufsize * sizeof(unsigned char)); total = 0; /* read up to 15 sectors from end, due to problems reading last few */ /* sectors on burned CDs */ while (total < isosize - SKIPSECTORS*2048) { - nread = read(isofd, buf, 2048); + nattempt = MIN(isosize - SKIPSECTORS*2048 - total, bufsize); + nread = read(isofd, buf, nattempt); + if (nread <= 0) break; MD5_Update(&md5ctx, buf, nread); + MD5_Update(&fragmd5ctx, buf, nread); + + /* if we're onto the next fragment, calculate the previous sum and write */ + current_fragment = total * (FRAGMENT_COUNT+1) / (isosize - SKIPSECTORS*2048); + if ( current_fragment != previous_fragment ) { + MD5_Final(fragmd5sum, &fragmd5ctx); + for (i=0; i<FRAGMENT_SUM_LENGTH/FRAGMENT_COUNT; i++) { + char tmpstr[2]; + snprintf(tmpstr, 2, "%01x", fragmd5sum[i]); + strncat(fragstr, tmpstr, 2); + } + /* printf("\nFragment [%i]: %s\n", previous_fragment, fragstr); */ + previous_fragment = current_fragment; + } + total = total + nread; } + free(buf); MD5_Final(md5sum, &md5ctx); @@ -157,12 +191,15 @@ for (i=0; i<16; i++) { char tmpstr[4]; snprintf (tmpstr, 4, "%02x", md5sum[i]); - strcat(md5str, tmpstr); + strncat(md5str, tmpstr, 2); } if (!quiet) { printf("Inserting md5sum into iso image...\n"); printf("md5 = %s\n", md5str); + printf("Inserting fragment md5sums into iso image...\n"); + printf("fragmd5 = %s\n", fragstr); + printf("frags = %d\n", FRAGMENT_COUNT); } /* memcpy(new_appdata, orig_appdata, 512); */ memset(new_appdata, ' ', 512); @@ -171,9 +208,12 @@ loc = writeAppData(new_appdata, "ISO MD5SUM = ", loc); loc = writeAppData(new_appdata, md5str, loc); loc = writeAppData(new_appdata, ";", loc); - snprintf(buf, sizeof(buf), "SKIPSECTORS = %d", SKIPSECTORS); + + buf = malloc(512 * sizeof(unsigned char)); + snprintf(buf, 512, "SKIPSECTORS = %d", SKIPSECTORS); loc = writeAppData(new_appdata, buf, loc); loc = writeAppData(new_appdata, ";", loc); + free(buf); if (supported) { if (!quiet) @@ -187,6 +227,16 @@ loc = writeAppData(new_appdata, ";", loc); + loc = writeAppData(new_appdata, "FRAGMENT SUMS = ", loc); + loc = writeAppData(new_appdata, fragstr, loc); + loc = writeAppData(new_appdata, ";", loc); + + buf = malloc(512 * sizeof(unsigned char)); + snprintf(buf, 512, "FRAGMENT COUNT = %d", FRAGMENT_COUNT); + loc = writeAppData(new_appdata, buf, loc); + loc = writeAppData(new_appdata, ";", loc); + free(buf); + loc = writeAppData(new_appdata, "THIS IS NOT THE SAME AS RUNNING MD5SUM ON THIS ISO!!", loc); i = lseek(isofd, pvd_offset + APPDATA_OFFSET, SEEK_SET);