[Fedora-infrastructure-list] Fix them mirrors

Mike McGrath <mmcgrath@xxxxxxxxxxxxxxxxx> · Thu, 06 Jul 2006 15:37:15 -0500

We should implement this.  We need to find a good place in CVS for it.

   -Mike

--- Begin Message ---

Subject: some more code
From: seth vidal <skvidal@xxxxxxxxxxxxxx>
Date: Wed, 05 Jul 2006 11:45:22 -0400
Delivered-to: imlinux@xxxxxxxxx

Hey,
 an update from the last one. A bit of refactoring to make it prettier.

-sv

#!/usr/bin/python -tt
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

# take a definitive repomd.xml
# compare it to the set of repomd.xml files retrieved from the mirrors
# specifically compare the timestamp on the primary.xml on each

# output list of good mirrors for a given path.

# TODO:
# better error handling
# push into a db?
# read from a config file for various mirrors
#   config info:
#    mirrorlist-input-file
#    mirrorlist-output-file
#    path-to-repodata
#    archlist
#    canonical repo's baseurl for this mirrorlist
#    timeout for mirror check
#    close-enough time for 'good' mirrors
# use geoip module to figure out country-of-origin per-mirror and write
#   out per-country lists as well as a global list.

import os
import sys
import re
import exceptions
from cElementTree import iterparse

from urlgrabber.grabber import URLGrabber
from urlgrabber.grabber import URLGrabError

class YumBaseError(Exception):
    """Base class for the errors raised by this script.

    ``args`` is the optional error payload (normally a message string).
    It is forwarded to ``Exception.__init__`` so that ``str(err)`` yields
    the message and ``err.args`` is a proper tuple.
    """
    def __init__(self, args=None):
        # Do not assign ``self.args`` directly: new-style exceptions reject
        # None (TypeError) and would split a bare string into characters.
        if args is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, args)

class RepoMDError(YumBaseError):
    """Raised when repomd.xml is damaged or lacks a requested datatype."""
    def __init__(self, args=None):
        # Hand the payload to the base class; calling it without the
        # payload would drop the message.
        YumBaseError.__init__(self, args)

def ns_cleanup(qn):
    """Strip a leading ``{namespace}`` qualifier from an element tag.

    Returns ``qn`` unchanged when it contains no ``}``; otherwise returns
    the text between the first ``}`` and the next one (or the end).
    """
    pieces = qn.split('}')
    if len(pieces) < 2:
        # no closing brace at all, so nothing to strip
        return qn
    return pieces[1]

class RepoData:
    """One ``<data>`` element of a repomd.xml file.

    Attributes set from the element:
      type         -- value of the element's ``type`` attribute
      location     -- (base, href) taken from the <location> child
      checksum     -- (type, value) taken from the <checksum> child
      openchecksum -- (type, value) taken from the <open-checksum> child
      timestamp    -- text of the <timestamp> child, kept as a string
    Anything the element does not provide stays None / (None, None).
    """
    def __init__(self, elem):
        # defaults first, parse() overwrites whatever the element supplies
        self.location = (None, None)
        self.checksum = (None, None)
        self.openchecksum = (None, None)
        self.timestamp = None
        self.type = elem.attrib.get('type')

        self.parse(elem)

    def parse(self, elem):
        """Fill in the attributes from the direct children of ``elem``."""
        for node in elem:
            tag = ns_cleanup(node.tag)

            if tag == 'timestamp':
                self.timestamp = node.text

            elif tag == 'location':
                self.location = (node.attrib.get('base'),
                                 node.attrib.get('href'))

            elif tag == 'checksum':
                self.checksum = (node.attrib.get('type'), node.text)

            elif tag == 'open-checksum':
                self.openchecksum = (node.attrib.get('type'), node.text)

class RepoMD:
    """represents the repomd xml file

    After construction ``repoData`` maps the ``type`` attribute of every
    <data> element to the RepoData object parsed from it.
    """

    def __init__(self, repoid, srcfile):
        """takes a repoid and a filename (or an open file object) for the
        repomd.xml

        Raises RepoMDError when the XML cannot be parsed.  A file opened
        here from a filename is always closed again; a file object passed
        in by the caller is left open for the caller to close.
        """

        self.repoid = repoid
        self.repoData = {}

        opened_here = isinstance(srcfile, str)
        if opened_here:
            # srcfile is a filename string
            infile = open(srcfile, 'rt')
        else:
            # srcfile is a file object
            infile = srcfile

        # nested try blocks rather than try/except/finally so the code
        # also runs on interpreters that predate the combined statement
        try:
            try:
                for event, elem in iterparse(infile):
                    if ns_cleanup(elem.tag) == "data":
                        thisdata = RepoData(elem=elem)
                        self.repoData[thisdata.type] = thisdata
            except SyntaxError:
                raise RepoMDError("Damaged repomd.xml file")
        finally:
            if opened_here:
                infile.close()

    def fileTypes(self):
        """return list of metadata file types available"""
        return list(self.repoData)

    def getData(self, type):
        """return the RepoData stored for ``type``

        Raises RepoMDError when no <data> element of that type was seen.
        """
        if type in self.repoData:
            return self.repoData[type]
        raise RepoMDError("Error: requested datatype %s not available" % type)

    def dump(self):
        """dump fun output"""

        for ft in self.fileTypes():
            thisdata = self.repoData[ft]
            # single-argument print(...) emits the same text whether print
            # is a statement or a function
            print('datatype: %s' % thisdata.type)
            print('location: %s %s' % thisdata.location)
            print('timestamp: %s' % thisdata.timestamp)
            print('checksum: %s -%s' % thisdata.checksum)
            print('open checksum: %s - %s' % thisdata.openchecksum)

class MirrorContainer(object):
    """Holds the per-arch primary metadata timestamps of one mirror.

    url      -- mirror base url; may contain $ARCH, $BASEARCH or $basearch
    grabber  -- object whose urlopen(url) returns a file-like object
    archlist -- architectures substituted into the url one by one

    ``timestamps`` maps arch -> timestamp string for every arch whose
    repomd.xml could be fetched and parsed; other arches are absent.
    """
    def __init__(self, url, grabber, archlist):
        self.url = url
        self.grabber = grabber
        self.timestamps = {}
        self.archlist = archlist
        self.get_timestamp(url)

    def get_timestamp(self, url):
        """Fetch repodata/repomd.xml below ``url`` for each arch and record
        the timestamp of its 'primary' entry in ``self.timestamps``.

        A download failure, a damaged repomd.xml or a repomd.xml without
        a 'primary' entry is reported on stdout and that arch is skipped,
        so one broken mirror cannot abort the whole run.
        """
        url = '%s/repodata/repomd.xml' % url
        # normalise both upper-case spellings to $basearch; plain string
        # replacement avoids any regex escaping of the literal '$'
        suburl = url.replace('$ARCH', '$BASEARCH')
        suburl = suburl.replace('$BASEARCH', '$basearch')

        for arch in self.archlist:
            finurl = suburl.replace('$basearch', arch)
            try:
                fo = self.grabber.urlopen(finurl)
            except URLGrabError:
                print('error on %s' % finurl)
                continue

            # always release the remote file, even when parsing fails
            try:
                try:
                    # getData raises RepoMDError for a missing 'primary'
                    thisdata = RepoMD('fooid', fo).getData('primary')
                except RepoMDError:
                    print('error on %s' % finurl)
                else:
                    self.timestamps[arch] = thisdata.timestamp
            finally:
                fo.close()

def main():
    """Print, per arch, the mirrors whose primary timestamp equals the
    canonical repository's.

    The canonical url, mirror list, arch list and timeout are hard-coded
    placeholders.  Exits with status 1 when the canonical host did not
    yield a timestamp for every arch.
    """
    ### obviously placeholders for stuff that should be read in from files
    canonical = 'http://redhat.download.fedoraproject.org/pub/fedora/linux/core/5/$ARCH/os/'
    mirrorlist = [
        'http://ftp.fi.muni.cz/pub/linux/fedora-core/5/$ARCH/os/',
        'ftp://ftp.tu-chemnitz.de/pub/linux/fedora-core/5/$ARCH/os/',
        'ftp://ftp.wsisiz.edu.pl/pub/linux/fedora/linux/core/5/$ARCH/os/',
        'http://ftp.ale.org/mirrors/fedora/linux/core/5/$ARCH/os/',
        'http://ftp.uninett.no/pub/linux/Fedora/core/5/$ARCH/os/',
        'http://ftp.tu-chemnitz.de/pub/linux/fedora-core/5/$ARCH/os/',
        'http://sunsite.informatik.rwth-aachen.de/ftp/pub/linux/fedora-core/5/$ARCH/os/',
        'ftp://ftp.tecnoera.com/pub/fedora/linux/core/5/$ARCH/os/',
        'ftp://redhat.taygeta.com/pub/RedHat/fedora/core/5/$ARCH/os/',
        'http://fr2.rpmfind.net/linux/fedora/core/5/$ARCH/os/',
        'http://ftp.riken.jp/Linux/fedora/core/5/$ARCH/os/',
    ]
    archlist = ['i386', 'ppc', 'x86_64']
    timeout = 10
    mirrors = []
    ### end

    # one urlgrabber shared by every request
    fetcher = URLGrabber(timeout=timeout)

    # the canonical repo is the reference; without a timestamp for every
    # arch there is nothing to compare the mirrors against
    reference = MirrorContainer(canonical, fetcher, archlist)
    if len(reference.timestamps) < len(archlist):
        print("Cannot contact canonical host for all archs, exiting")
        sys.exit(1)

    # debug only - just printing out info
    for arch in archlist:
        if arch in reference.timestamps:
            print('  %s: %s' % (arch, reference.timestamps[arch]))

    # collect the info for all the mirrors
    for url in mirrorlist:
        candidate = MirrorContainer(url, fetcher, archlist)
        if not candidate:
            # MirrorContainer defines no __len__/__nonzero__, so this
            # guard never actually skips anything
            continue
        mirrors.append(candidate)

    # report the in-sync mirrors arch by arch
    for arch in archlist:
        print(arch)
        wanted = reference.timestamps[arch]
        for mirror in mirrors:
            if arch in mirror.timestamps and mirror.timestamps[arch] == wanted:
                print('  %s' % mirror.url)

# Run the mirror check only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

--- End Message ---