Hi Seth, Any objections to applying this patch for hardlinking in reposync? On Sun, Jul 15, 2007 at 07:54:28PM +0100, Daniel P. Berrange wrote: > On Wed, May 30, 2007 at 05:36:03PM -0400, seth vidal wrote: > > On Wed, 2007-05-30 at 19:15 +0100, Daniel P. Berrange wrote: > > > When syncing multiple repositories across many architectures there will be > > > a fairly large number of duplicate RPMs present in all repositories. This > > > is particularly true of x86_64 multilib distros which share most of the > > > i386 packages. Downloading many GB of packages for x86_64 repo which are > > > already present in an i386 synced repo is wasteful of time, disk space and > > > bandwidth. So this patch adds an extra argument --package-cache which takes > > > a directory path. This argument can be repeated multiple times. Before > > > downloading a package, reposync will check each of these directories to see > > > if they already contain a copy of the package. If so, the local package will > > > be hard-linked into the destdir. > > > > > > Typical usage would be to list the i386 repo cache, when running a sync of > > > the x86_64 repository, and vice-versa. > > > > > > To give an idea of the saving, Fedora rawhide currently has ~7400 rpms in > > > i386 repos, and 13,000 in x86_64 trees. ~6900 of these RPMs were present > > > in both trees. Hardlinking saved 8 GB of disk space & avoided 8 GB of file > > > downloads :-) > > > > > > > This patch looks good, but I'd be happier if you would check the > > checksum from the package to make sure it matches the remote metadata > > instead of just relying on name, size, etc. > > Totally lost track of the fact that I still needed to re-work this patch :-( Anyway > I've changed the way it works now. First there is a new --checksum argument > which enables verification of package checksums (either SHA or MD5 - takes > whichever is in the repo metadata). > > There were actually 3 scenarios where we should be verifying packages > > 1. 
If we are hardlinking in from another directory > 2. If we already have a copy in our local directory > 3. After we downloaded a new RPM > > The existing code was only ever checking package size in step 2, and only > ever checking GPG key in step 3. So I factored out a separate 'checkPkg' > method, and all 3 of those locations will check at the very least the file > size. If the file size matches, then it will also check GPG signature and/or > checksums if the appropriate command line args are used. > > Adding checksum verification turned out to be very worthwhile - turns out > I had a fair number of previously downloaded RPMs which had non-matching > checksums wrt the repo metadata. > > Attaching the updated patch. Regards, Dan. -- |=- Red Hat, Engineering, Emerging Technologies, Boston. +1 978 392 2496 -=| |=- Perl modules: http://search.cpan.org/~danberr/ -=| |=- Projects: http://freshmeat.net/~danielpb/ -=| |=- GnuPG: 7D3B9505 F3C9 553F A1DA 4AC2 5648 23C1 B3DF F742 7D3B 9505 -=|
--- reposync.py 2007-06-05 14:04:02.000000000 -0400 +++ /home/berrange/usr/bin/reposync 2007-07-15 13:37:23.000000000 -0400 @@ -47,6 +47,7 @@ import rpmUtils.arch import logging from urlgrabber.progress import TextMeter +from urlgrabber.grabber import URLGrabError # for yum 2.4.X compat def sortPkgObj(pkg1 ,pkg2): @@ -103,10 +104,14 @@ help="Use a temp dir for storing/accessing yum-cache") parser.add_option("-d", "--delete", default=False, action="store_true", help="delete local packages no longer present in repository") + parser.add_option("-k", "--package-cache", default=[], dest='pkgcache', action='append', + help="additional directory to search for pre-existing packages") parser.add_option("-p", "--download_path", dest='destdir', default=os.getcwd(), help="Path to download packages to: defaults to current dir") parser.add_option("-g", "--gpgcheck", default=False, action="store_true", help="Remove packages that fail GPG signature checking after downloading") + parser.add_option("-s", "--checksum", default=False, action="store_true", + help="Checksum local downloaded packages to verify integrity") parser.add_option("-u", "--urls", default=False, action="store_true", help="Just list urls of what would be downloaded, don't download") parser.add_option("-n", "--newest-only", dest='newest', default=False, action="store_true", @@ -118,6 +123,36 @@ return (opts, args) +def checkPkg(opts, my, pkg, local): + # Size check first since it is fast + if os.path.getsize(local) != int(pkg.returnSimple('packagesize')): + return False, "size difference" + + # Next up a checksum + if opts.checksum: + try: + sums = pkg.returnChecksums() + if len(sums) > 0: + my.verifyChecksum(local, sums[0][0], sums[0][1]) + except URLGrabError, e: + return False, "failed checksum" + + # Finally a gpg check + if opts.gpgcheck: + # Hack to make sig check go against our localpath + pkg.localpath = local + result, error = my.sigCheckPkg(pkg) + if result != 0: + if result == 1: + return False, "missing 
GPG key" + elif result == 2: + return False, "failed signature" + else: + return False, "failed signature %s" % error + + return True, "" + + def main(): (opts, junk) = parseArgs() @@ -135,6 +170,14 @@ my = RepoSync(opts=opts) my.doConfigSetup(fn=opts.config, init_plugins=False) + # Populate cache of existing download RPMs from other + # repositories we can link to + pkgcache = {} + for dir in opts.pkgcache: + cache = localpkgs(dir) + for k in cache.keys(): + pkgcache[k] = cache[k] + # Force unprivileged users to have a private temporary cachedir # if they've not given an explicit cachedir if os.getuid() != 0 and not opts.cachedir: @@ -183,7 +226,18 @@ download_list = list(reposack) local_repo_path = opts.destdir + '/' + repo.id - if opts.delete and os.path.exists(local_repo_path): + # make sure the repo subdir is here before we go on. + if not os.path.exists(local_repo_path): + try: + os.makedirs(local_repo_path) + except IOError, e: + my.logger.error("Could not make repo subdir: %s" % e) + my.closeRpmDB() + sys.exit(1) + + # Check if there's any local files no longer on the remote + # repo which need purging + if opts.delete: current_pkgs = localpkgs(local_repo_path) download_set = {} @@ -200,62 +254,65 @@ my.logger.info("Removing obsolete %s", pkg) os.unlink(current_pkgs[pkg]['path']) + download_list.sort(sortPkgObj) n = 0 for pkg in download_list: n = n + 1 repo = my.repos.getRepo(pkg.repoid) remote = pkg.returnSimple('relativepath') + rpmname = os.path.basename(remote) local = local_repo_path + '/' + remote localdir = os.path.dirname(local) + if not os.path.exists(localdir): os.makedirs(localdir) - if (os.path.exists(local) and - os.path.getsize(local) == int(pkg.returnSimple('packagesize'))): - - - if not opts.quiet: - my.logger.error("[%s: %-5d of %-5d ] Skipping existing %s" % (repo.id, n, len(download_list), remote)) - continue - + # If there's already a local copy, check it + if os.path.exists(local): + matching, err = checkPkg(opts, my, pkg, local) + if 
matching: + my.logger.info("[%s: %-5d of %-5d ] Skipping existing %s" % (repo.id, n, len(download_list), remote)) + continue + else: + if not opts.quiet: + my.logger.info("[%s: %-5d of %-5d ] Removing existing %s due to %s" % (repo.id, n, len(download_list), remote, err)) + os.unlink(local) + + # If another dir has a local copy, and it is on + # the same storage device, and verification passes, + # then we can hardlink it into local dir. + if not os.path.exists(local) and \ + pkgcache.has_key(rpmname) and \ + os.stat(local_repo_path).st_dev == pkgcache[rpmname]['device']: + matching, err = checkPkg(opts, my, pkg, pkgcache[rpmname]['path']) + if matching: + if not opts.quiet: + my.logger.info("[%s: %-5d of %-5d ] Linking existing %s" % (repo.id, n, len(download_list), remote)) + os.link(pkgcache[rpmname]['path'], local) + continue + + # If we're just printing URLs, skip to next repo if opts.urls: url = urljoin(repo.urls[0],remote) print '%s' % url continue - - # make sure the repo subdir is here before we go on. - if not os.path.exists(local_repo_path): - try: - os.makedirs(local_repo_path) - except IOError, e: - my.logger.error("Could not make repo subdir: %s" % e) - my.closeRpmDB() - sys.exit(1) - - # Disable cache otherwise things won't download + + # Disable cache otherwise things won't download repo.cache = 0 if not opts.quiet: my.logger.info( '[%s: %-5d of %-5d ] Downloading %s' % (repo.id, n, len(download_list), remote)) - pkg.localpath = local # Hack: to set the localpath we want. - path = repo.getPackage(pkg) - if opts.gpgcheck: - result, error = my.sigCheckPkg(pkg) - if result != 0: - if result == 1: - my.logger.warning('Removing %s, due to missing GPG key.' % os.path.basename(remote)) - elif result == 2: - my.logger.warning('Removing %s due to failed signature check.' 
% os.path.basename(remote)) - else: - my.logger.warning('Removing %s due to failed signature check: %s' % (os.path.basename(remote), error)) - os.unlink(path) - continue - if not os.path.exists(local) or not os.path.samefile(path, local): - shutil.copy2(path, local) + pkg.localpath = local # Hack: to set the localpath we want. + repo.getPackage(pkg) + matching, err = checkPkg(opts, my, pkg, local) + if not matching: + my.logger.warning("[%s: %-5d of %-5d ] Removing failed download %s due to %s", (repo.id, n, len(download_list), remote, err)) + os.unlink(path) + continue my.closeRpmDB() if __name__ == "__main__": main() - +
_______________________________________________ Yum mailing list Yum@xxxxxxxxxxxxxxxxxxxx https://lists.dulug.duke.edu/mailman/listinfo/yum