--- Begin Message ---
Hey,
an update from the last one. A bit of refactoring to make it prettier.
-sv
#!/usr/bin/python -tt
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# take a definitive repomd.xml
# compare it to set of them retrieved from mirrors
# specifically compare the timestamp on the primary.xml on each
# output list of good mirrors for a given path.
# TODO:
# better error handling
# push into a db?
# read from a config file for various mirrors
# config info:
# mirrorlist-input-file
# mirrorlist-output-file
# path-to-repodata
# archlist
# canonical repo's baseurl for this mirrorlist
# timeout for mirror check
# close-enough time for 'good' mirrors
# use geoip module to figure out country-of-origin per-mirror and write
# out per-country lists as well as a global list.
import os
import sys
import re
import exceptions
from cElementTree import iterparse
from urlgrabber.grabber import URLGrabber
from urlgrabber.grabber import URLGrabError
class YumBaseError(Exception):
    """Base class for the errors raised by this script.

    The optional message is handed to Exception.__init__, so it becomes
    the single element of ``self.args`` and ``str(err)`` returns it
    unchanged.  Assigning a plain string to ``self.args`` directly is
    converted to a tuple of its characters on new-style exceptions, and
    assigning ``None`` is rejected with a TypeError.
    """

    def __init__(self, args=None):
        # 'args' keeps its historical parameter name; it is the message.
        # The builtin Exception is the same class as exceptions.Exception.
        if args is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, args)


class RepoMDError(YumBaseError):
    """Raised for a damaged repomd.xml or an unavailable metadata type."""

    def __init__(self, args=None):
        # Forward the message so the base class stores it exactly once.
        YumBaseError.__init__(self, args)
def ns_cleanup(qn):
    """Return tag name `qn` without its leading '{namespace}' qualifier.

    A name containing no '}' is returned unchanged; otherwise the text
    following the first '}' (up to the next '}', if any) is returned.
    """
    pieces = qn.split('}')
    if len(pieces) == 1:
        return qn
    return pieces[1]
class RepoData:
    """represents anything beneath a <data> tag

    Attributes:
        type         -- value of the element's 'type' attribute
        location     -- (base, href) taken from the <location> child
        checksum     -- (type, value) taken from the <checksum> child
        openchecksum -- (type, value) taken from the <open-checksum> child
        timestamp    -- text of the <timestamp> child, stored as read
    Every attribute stays None (or a pair of Nones) when the matching
    attribute or child element is absent.
    """

    def __init__(self, elem):
        self.type = elem.attrib.get('type')
        self.location = (None, None)
        self.checksum = (None, None)      # type, value
        self.openchecksum = (None, None)  # type, value
        self.timestamp = None
        self.parse(elem)

    def parse(self, elem):
        """Fill the attributes from the direct children of `elem`."""
        for node in elem:
            tag = ns_cleanup(node.tag)
            if tag == 'timestamp':
                self.timestamp = node.text
            elif tag == 'location':
                attrs = node.attrib
                self.location = (attrs.get('base'), attrs.get('href'))
            elif tag in ('checksum', 'open-checksum'):
                pair = (node.attrib.get('type'), node.text)
                if tag == 'checksum':
                    self.checksum = pair
                else:
                    self.openchecksum = pair
class RepoMD:
    """represents the repomd xml file

    After construction, self.repoData maps each metadata type found in
    the file (the 'type' attribute of a <data> element) to its RepoData.
    """

    def __init__(self, repoid, srcfile):
        """Parse a repomd.xml.

        repoid  -- identifier stored on the instance as self.repoid
        srcfile -- filename of the repomd.xml, or an open file object

        Raises RepoMDError when the parser reports a SyntaxError.
        """
        self.repoid = repoid
        self.repoData = {}

        # Anything without a read() method is treated as a path.  A file
        # opened here is also closed here; a file object handed in by the
        # caller is left open for the caller to close.
        opened_here = not hasattr(srcfile, 'read')
        if opened_here:
            # Binary mode: the XML parser decodes the document itself.
            infile = open(srcfile, 'rb')
        else:
            infile = srcfile

        try:
            try:
                for event, elem in iterparse(infile):
                    if ns_cleanup(elem.tag) == "data":
                        thisdata = RepoData(elem=elem)
                        self.repoData[thisdata.type] = thisdata
            except SyntaxError:
                raise RepoMDError("Damaged repomd.xml file")
        finally:
            if opened_here:
                infile.close()

    def fileTypes(self):
        """return list of metadata file types available"""
        return self.repoData.keys()

    def getData(self, type):
        """Return the RepoData stored for metadata type `type`.

        Raises RepoMDError when that type is not present in the file.
        """
        if type in self.repoData:
            return self.repoData[type]
        raise RepoMDError("Error: requested datatype %s not available" % type)

    def dump(self):
        """dump fun output"""
        for ft in self.fileTypes():
            thisdata = self.repoData[ft]
            # location/checksum/openchecksum are 2-tuples and fill both %s.
            print('datatype: %s' % thisdata.type)
            print('location: %s %s' % thisdata.location)
            print('timestamp: %s' % thisdata.timestamp)
            print('checksum: %s -%s' % thisdata.checksum)
            print('open checksum: %s - %s' % thisdata.openchecksum)
class MirrorContainer(object):
    """Primary-metadata timestamps published by one mirror, per arch.

    Attributes:
        url        -- mirror base URL exactly as passed in
        grabber    -- object whose urlopen(url) returns a file-like object
        archlist   -- architectures to query
        timestamps -- maps arch to the timestamp of the 'primary' entry;
                      archs that could not be fetched or parsed are absent
    """

    def __init__(self, url, grabber, archlist):
        self.url = url
        self.grabber = grabber
        self.timestamps = {}
        self.archlist = archlist
        self.get_timestamp(url)

    def get_timestamp(self, url):
        """Fetch <url>/repodata/repomd.xml for every arch in self.archlist.

        The placeholders $ARCH, $BASEARCH and $basearch in `url` are all
        replaced by the arch being queried.  A failed download, a damaged
        repomd.xml or one lacking a 'primary' entry is reported on stdout
        and that arch is skipped, so one bad mirror cannot abort the run.
        """
        # Avoid a doubled slash when the base URL already ends in '/'.
        url = '%s/repodata/repomd.xml' % url.rstrip('/')
        # Normalise every spelling of the placeholder to '$basearch'.
        # Plain string replacement keeps the arch text literal.
        suburl = url.replace('$ARCH', '$BASEARCH')
        suburl = suburl.replace('$BASEARCH', '$basearch')

        for arch in self.archlist:
            finurl = suburl.replace('$basearch', arch)
            try:
                fo = self.grabber.urlopen(finurl)
            except URLGrabError:
                print('error on %s' % finurl)
                continue

            # Close the response whether or not parsing succeeds.
            try:
                try:
                    primary = RepoMD('fooid', fo).repoData['primary']
                    self.timestamps[arch] = primary.timestamp
                except (RepoMDError, KeyError):
                    print('error parsing %s' % finurl)
            finally:
                fo.close()
def main():
    """Print, per arch, the mirrors that are in sync with the canonical repo.

    A mirror counts as in sync for an arch when the timestamp of its
    'primary' metadata equals the canonical repo's.  Exits with status 1
    when the canonical repo cannot be read for every arch in archlist.
    """
    ### obviously placeholders for stuff that should be read in from files
    canonical = 'http://redhat.download.fedoraproject.org/pub/fedora/linux/core/5/$ARCH/os/'
    mirrorlist = [
        'http://ftp.fi.muni.cz/pub/linux/fedora-core/5/$ARCH/os/',
        'ftp://ftp.tu-chemnitz.de/pub/linux/fedora-core/5/$ARCH/os/',
        'ftp://ftp.wsisiz.edu.pl/pub/linux/fedora/linux/core/5/$ARCH/os/',
        'http://ftp.ale.org/mirrors/fedora/linux/core/5/$ARCH/os/',
        'http://ftp.uninett.no/pub/linux/Fedora/core/5/$ARCH/os/',
        'http://ftp.tu-chemnitz.de/pub/linux/fedora-core/5/$ARCH/os/',
        'http://sunsite.informatik.rwth-aachen.de/ftp/pub/linux/fedora-core/5/$ARCH/os/',
        'ftp://ftp.tecnoera.com/pub/fedora/linux/core/5/$ARCH/os/',
        'ftp://redhat.taygeta.com/pub/RedHat/fedora/core/5/$ARCH/os/',
        'http://fr2.rpmfind.net/linux/fedora/core/5/$ARCH/os/',
        'http://ftp.riken.jp/Linux/fedora/core/5/$ARCH/os/',
    ]
    archlist = ['i386', 'ppc', 'x86_64']
    mirrors = []
    timeout = 10
    ### end

    # setup our urlgrabber
    ug = URLGrabber(timeout=timeout)

    # grab the canonical mirrors info
    canon = MirrorContainer(canonical, ug, archlist)
    if len(canon.timestamps) < len(archlist):
        # if we can't get info for all arches for the canonical mirror, exit
        print("Cannot contact canonical host for all archs, exiting")
        sys.exit(1)

    # debug only - just printing out info
    for arch in archlist:
        if arch in canon.timestamps:
            print(' %s: %s' % (arch, canon.timestamps[arch]))

    # get the info for all the mirrors; a mirror that answered for no
    # arch at all can never match, so it is not kept
    for url in mirrorlist:
        m = MirrorContainer(url, ug, archlist)
        if m.timestamps:
            mirrors.append(m)

    # print them out per-arch
    for arch in archlist:
        print(arch)
        for m in mirrors:
            if arch in m.timestamps and m.timestamps[arch] == canon.timestamps[arch]:
                print(' %s' % m.url)
# Run the mirror check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
--- End Message ---