Re: [Fedora-infrastructure-list] Fix them mirrors

seth vidal <skvidal@xxxxxxxxxxxxxx> · Fri, 07 Jul 2006 17:21:59 -0400

On Thu, 2006-07-06 at 15:37 -0500, Mike McGrath wrote:
> We should implement this.  We need to find a good place in CVS for it.
> 

More fun:

foolist is just the mirrorlist for core as I pulled from the url in a
yum .repo file in fc5

the config file is for the python script and it is passed as the
argument to the script.

run it and it will write per-country mirrorlists and a global up-to-date
mirrorlist into whatever outputpath is defined in the conf file.

I'll try to tidy this up further and get the cgi worked out for the
remote requests soon.

-sv

#!/usr/bin/python -tt
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

# take a definitive repomd.xml
# compare it to set of them retrieved from mirrors
# specifically compare the timestamp on the primary.xml on each

# output list of good mirrors for a given path.

# When true, main() prints the canonical mirror's timestamp for each arch.
debug = True

# TODO:
# better error handling
# push into a db?
# read from a config file for various mirrors
#   config info:
#    mirrorlist-input-file
#    mirrorlist-output-path
#    path-to-repodata
#    archlist
#    canonical repo's baseurl for this mirrorlist
#    timeout for mirror check
#    close-enough time for 'good' mirrors
# use geoip module to figure out country-of-origin per-mirror and write
#   out per-country lists as well as a global list.

import os
import sys
import re
import exceptions
from cElementTree import iterparse
import GeoIP
import ConfigParser
import socket
import urlparse

from urlgrabber.grabber import URLGrabber
from urlgrabber.grabber import URLGrabError

class YumBaseError(Exception):
    """Base class for the errors raised by this script.

    args -- the error message (any object); omit it for a message-less error.

    str() of the exception is the message itself, or '' when none was given.
    """
    def __init__(self, args=None):
        # Hand the message to Exception as ONE positional argument.
        # Assigning a bare string to self.args makes new-style exceptions
        # (Python >= 2.5) split it into a tuple of single characters, so
        # printing the error showed garbage; assigning None raised TypeError.
        if args is None:
            Exception.__init__(self)
        else:
            Exception.__init__(self, args)

class RepoMDError(YumBaseError):
    """Raised for a damaged repomd.xml or a metadata type it does not list."""
    def __init__(self, args=None):
        # Forward the message so the base class stores it exactly once.
        YumBaseError.__init__(self, args)

def ns_cleanup(qn):
    """Strip a leading '{namespace}' from an element tag name.

    Names without a '}' are returned unchanged; otherwise the text between
    the first and second '}' (or the end of the string) is returned.
    """
    if '}' in qn:
        pieces = qn.split('}')
        return pieces[1]
    return qn

def errorprint(stuff):
    """Write stuff, followed by a newline, to standard error."""
    # (stuff,) keeps a tuple argument from being unpacked by the % operator
    sys.stderr.write('%s\n' % (stuff,))

def check_and_make_dir(dir):
    """
     Make sure dir is a writable directory, creating it when it is missing.

     Missing parent directories are created as well, so a nested output
     path can be used without preparing it by hand.

     Returns True when dir exists (or was created) and is usable, False
     otherwise; the reason for a failure is reported through errorprint().
    """
    if os.path.exists(dir):
        if not os.path.isdir(dir):
            errorprint('%s is not a dir' % dir)
            return False
        if not os.access(dir, os.W_OK):
            errorprint('%s is not writable' % dir)
            return False
        return True

    try:
        # makedirs (rather than mkdir) so intermediate directories are made too
        os.makedirs(dir)
    except OSError:
        # sys.exc_info() instead of "except OSError, e" keeps this readable
        # by both old and new interpreters
        e = sys.exc_info()[1]
        errorprint('Error creating dir %s: %s' % (dir, e))
        return False
    return True

class RepoData:
    """One <data> element of a repomd.xml file.

    Attributes:
      type         -- the element's 'type' attribute
      location     -- (base, href) taken from the <location> child
      checksum     -- (type, value) taken from the <checksum> child
      openchecksum -- (type, value) taken from the <open-checksum> child
      timestamp    -- text of the <timestamp> child
    Anything the element does not provide stays None.
    """
    def __init__(self, elem):
        self.type = elem.attrib.get('type')
        self.timestamp = None
        self.location = (None, None)
        self.checksum = (None, None) # type,value
        self.openchecksum = (None, None) # type,value

        self.parse(elem)

    def parse(self, elem):
        """Fill in the attributes from the children of elem."""
        for node in elem:
            tag = ns_cleanup(node.tag)
            attrs = node.attrib

            if tag == 'timestamp':
                self.timestamp = node.text
            elif tag == 'location':
                self.location = (attrs.get('base'), attrs.get('href'))
            elif tag == 'checksum':
                self.checksum = (attrs.get('type'), node.text)
            elif tag == 'open-checksum':
                self.openchecksum = (attrs.get('type'), node.text)

class RepoMD:
    """represents the repomd xml file

    repoData maps each <data> element's type to its RepoData object.
    """

    def __init__(self, repoid, srcfile):
        """takes a repoid and a filename for the repomd.xml

        srcfile may also be an already-open file object; in that case the
        caller stays responsible for closing it.

        Raises RepoMDError when the XML cannot be parsed.
        """

        self.repoid = repoid
        self.repoData = {}

        # Only a file opened here is closed here.
        opened_here = isinstance(srcfile, str)
        if opened_here:
            # srcfile is a filename string
            infile = open(srcfile, 'rt')
        else:
            # srcfile is a file object
            infile = srcfile

        try:
            parser = iterparse(infile)

            try:
                for event, elem in parser:
                    if ns_cleanup(elem.tag) == "data":
                        thisdata = RepoData(elem=elem)
                        self.repoData[thisdata.type] = thisdata
            except SyntaxError:
                raise RepoMDError("Damaged repomd.xml file")
        finally:
            # previously the handle opened above was never closed
            if opened_here:
                infile.close()

    def fileTypes(self):
        """return list of metadata file types available"""
        return list(self.repoData.keys())

    def getData(self, type):
        """return the RepoData for the given type, or raise RepoMDError"""
        if type in self.repoData:
            return self.repoData[type]
        raise RepoMDError("Error: requested datatype %s not available" % type)

    def dump(self):
        """dump fun output"""

        for ft in self.fileTypes():
            thisdata = self.repoData[ft]
            sys.stdout.write('datatype: %s\n' % thisdata.type)
            sys.stdout.write('location: %s %s\n' % thisdata.location)
            sys.stdout.write('timestamp: %s\n' % thisdata.timestamp)
            sys.stdout.write('checksum: %s -%s\n' % thisdata.checksum)
            sys.stdout.write('open checksum: %s - %s\n' %  thisdata.openchecksum)

class MirrorContainer(object):
    """What we know about one mirror: its per-arch timestamps and country.

    Attributes:
      url        -- the mirror url as given (may contain $ARCH / $BASEARCH)
      timestamps -- maps arch to the timestamp of the 'primary' entry in
                    that arch's repodata/repomd.xml; arches that could not
                    be fetched or parsed are simply absent
      country    -- country code reported by the geoip object for the
                    mirror's host, or None when it could not be determined

    Creating an instance performs the fetches and the lookup immediately.
    """
    def __init__(self, url, grabber, archlist, gi):
        self.url = url
        self.grabber = grabber
        self.geoip = gi
        self.timestamps = {}
        self.archlist = archlist
        self.country = None
        self.get_timestamp(url)
        self.get_country(url)

    def get_timestamp(self, url):
        """Fetch repomd.xml for every arch and record its primary timestamp.

        Failures are reported on stdout and leave that arch out of
        self.timestamps; they never abort the remaining arches.
        """
        template = '%s/repodata/repomd.xml' % url
        # normalise both spellings of the placeholder to $basearch
        template = template.replace('$ARCH', '$BASEARCH')
        template = template.replace('$BASEARCH', '$basearch')

        for arch in self.archlist:
            finurl = template.replace('$basearch', arch)
            try:
                fo = self.grabber.urlopen(finurl)
            except URLGrabError:
                sys.stdout.write('error on %s\n' % finurl)
                continue

            parsed = None
            try:
                try:
                    parsed = RepoMD('fooid', fo)
                except RepoMDError:
                    e = sys.exc_info()[1]
                    sys.stdout.write('%s\n' % (e,))
            finally:
                # close the handle on the failure path too
                fo.close()

            if parsed is None:
                continue

            if 'primary' not in parsed.repoData:
                # a repomd.xml without primary metadata tells us nothing
                sys.stdout.write('error on %s\n' % finurl)
                continue

            self.timestamps[arch] = parsed.repoData['primary'].timestamp

    def get_country(self, url):
        """Resolve the url's host and store its country code in self.country.

        self.country is left as (or reset to) None when the url has no
        host or the host does not resolve, so one bad mirror cannot stop
        the whole run.
        """
        # unparse url
        # resolve out ip
        # get county by addr

        netloc = urlparse.urlparse(url)[1]
        # drop any 'user:password@' prefix and ':port' suffix
        host = netloc.split('@')[-1].split(':')[0]
        if not host:
            self.country = None
            return

        try:
            addr = socket.gethostbyname(host)
        except socket.error:
            # covers the resolver errors (gaierror / herror) as well
            sys.stdout.write('error on %s\n' % url)
            self.country = None
            return

        self.country = self.geoip.country_code_by_addr(addr)

class MirrorListInfo(object):
    """Settings and mirror urls for one section of the config file."""
    def __init__(self):
        self.archlist = ['i386', 'x86_64', 'ppc']  # arches checked by default
        self.mirrorid = None      # name of the config section
        self.inputfile = None     # file holding one mirror url per line
        self.outputpath = None    # directory the result lists are written to
        self.timeout = 10         # handed to URLGrabber as its timeout
        self.canonical = None     # url of the reference repository
        self.mirrorlist = []      # filled in by populate_mirrorlist()

    def populate_mirrorlist(self):
        """Append the urls listed in self.inputfile to self.mirrorlist.

        Surrounding whitespace (including '\\r') is stripped and blank lines
        are skipped, so they can no longer turn into empty mirror urls.
        When the file cannot be opened the reason is written to stderr and
        self.mirrorlist is left untouched.
        """
        try:
            fo = open(self.inputfile, 'r')
        except IOError:
            e = sys.exc_info()[1]
            sys.stderr.write('Cannot read mirrorlist %s: %s\n' % (self.inputfile, e))
            return

        try:
            for line in fo:
                url = line.strip()
                if url:
                    self.mirrorlist.append(url)
        finally:
            fo.close()

def config(cfg):
    """Read the config file cfg and return one MirrorListInfo per section.

    Each section must provide inputfile, outputpath and canonical; every
    missing one is reported and the program then exits with status 1.
    timeout (an integer) and archlist (comma and/or whitespace separated)
    are optional and override the MirrorListInfo defaults.
    """
    parser = ConfigParser.ConfigParser()
    parser.read(cfg)

    # option name -> complaint printed when the option is absent
    required = (
        ('inputfile', 'missing inputfile'),
        ('outputpath', 'missing outputpath'),
        ('canonical', 'missing canonical url'),
    )

    results = []
    for name in parser.sections():
        info = MirrorListInfo()
        info.mirrorid = '%s' % name

        complete = True
        for option, complaint in required:
            if parser.has_option(name, option):
                setattr(info, option, parser.get(name, option))
            else:
                errorprint(complaint)
                complete = False

        if not complete:
            errorprint("Broooooooooooooken config, in section %s, bailing" % name)
            sys.exit(1)

        if parser.has_option(name, 'timeout'):
            info.timeout = parser.getint(name, 'timeout')

        if parser.has_option(name, 'archlist'):
            raw = parser.get(name, 'archlist')
            # newlines and commas both count as separators
            info.archlist = raw.replace('\n', ' ').replace(',', ' ').split()

        results.append(info)

    return results

def _write_url_list(path, urls):
    """Write urls, one per line, to path, replacing any previous content."""
    fo = open(path, 'w')
    try:
        for url in urls:
            fo.write('%s\n' % url)
    finally:
        fo.close()

def main(cfg_file):
    """Check the mirrors of every config section and write the good ones.

    For each section of cfg_file the canonical repository's per-arch
    timestamps are fetched, every mirror from the section's input file is
    checked against them, and these files are written under the section's
    outputpath:
        <mirrorid>-global-<arch>.txt     all up-to-date mirrors
        <mirrorid>-<country>-<arch>.txt  the up-to-date mirrors per country
    A mirror is up to date for an arch when its timestamp equals the
    canonical one. Sections that cannot be processed are reported on
    stderr and skipped.
    """

    sections = config(cfg_file)
    gi = GeoIP.new(GeoIP.GEOIP_STANDARD)

    for s in sections:

        s.populate_mirrorlist()
        if not s.mirrorlist:
            errorprint("no mirrors to look at for %s, something is broken, skipping" % s.mirrorid)
            continue

        if not check_and_make_dir(s.outputpath):
            errorprint('Error creating output path %s for %s' % (s.outputpath, s.mirrorid))
            continue

        # grab the canonical mirrors info
        ug = URLGrabber(timeout=s.timeout)
        canon = MirrorContainer(s.canonical, ug, s.archlist, gi)
        if len(canon.timestamps) < len(s.archlist):
            # if we can't get info for all arches for the canonical mirror, skip
            errorprint("Cannot contact canonical host for all archs, skipping")
            continue

        if debug:
            # debug only - just printing out info
            for arch in s.archlist:
                if arch in canon.timestamps:
                    sys.stdout.write('  %s: %s\n' % (arch, canon.timestamps[arch]))

        # get the info for all the mirrors; the list is rebuilt for every
        # section so one section's mirrors never leak into the next one
        mirrors = [MirrorContainer(url, ug, s.archlist, gi) for url in s.mirrorlist]

        # print them out per-arch
        for arch in s.archlist:
            wanted = canon.timestamps[arch]
            good = [m for m in mirrors
                    if arch in m.timestamps and m.timestamps[arch] == wanted]

            glob = os.path.join(s.outputpath, '%s-global-%s.txt' % (s.mirrorid, arch))
            _write_url_list(glob, [m.url for m in good])

            # Only up-to-date mirrors go into the country lists, and each
            # list is rewritten from scratch instead of appended to, so
            # repeated runs do not pile up duplicates.
            by_country = {}
            for m in good:
                if m.country:
                    by_country.setdefault(m.country, []).append(m.url)

            for code, urls in by_country.items():
                country = os.path.join(s.outputpath,
                                       '%s-%s-%s.txt' % (s.mirrorid, code, arch))
                _write_url_list(country, urls)

if __name__ == '__main__':
    # The config file path is the one required argument; ask for it
    # instead of dying with an IndexError traceback.
    if len(sys.argv) < 2:
        sys.stderr.write('usage: %s <config-file>\n' % sys.argv[0])
        sys.exit(1)
    main(sys.argv[1])

#[extras-5]
#inputfile = /tmp/global-mirrors-extras-5
#outputpath = /tmp/mirrors-extras-5
#timeout = 10
#canonical = http://redhat.download.fedoraproject.org/pub/fedora/linux/extras/5/$ARCH/os/

[core-5]
inputfile = /tmp/foolist
outputpath = /tmp/foopath/
archlist = i386, x86_64, ppc
timeout = 10
canonical = http://redhat.download.fedoraproject.org/pub/fedora/linux/core/5/$ARCH/os/
http://redhat.download.fedoraproject.org/pub/fedora/linux/core/5/$ARCH/os/
http://ftp.fi.muni.cz/pub/linux/fedora-core/5/$ARCH/os/
ftp://ftp.tu-chemnitz.de/pub/linux/fedora-core/5/$ARCH/os/
ftp://ftp.wsisiz.edu.pl/pub/linux/fedora/linux/core/5/$ARCH/os/
http://ftp.ale.org/mirrors/fedora/linux/core/5/$ARCH/os/
http://ftp.uninett.no/pub/linux/Fedora/core/5/$ARCH/os/
http://ftp.tu-chemnitz.de/pub/linux/fedora-core/5/$ARCH/os/
http://sunsite.informatik.rwth-aachen.de/ftp/pub/linux/fedora-core/5/$ARCH/os/
ftp://ftp.tecnoera.com/pub/fedora/linux/core/5/$ARCH/os/
ftp://redhat.taygeta.com/pub/RedHat/fedora/core/5/$ARCH/os/
http://fr2.rpmfind.net/linux/fedora/core/5/$ARCH/os/
http://ftp.riken.jp/Linux/fedora/core/5/$ARCH/os/
http://zeniv.linux.org.uk/pub/distributions/fedora/linux/core/5/$ARCH/os/
http://zeniiia.linux.org.uk/pub/distributions/fedora/linux/core/5/$ARCH/os/
ftp://ftp.wicks.co.nz/pub/linux/dist/fedora/5/$ARCH/os/
ftp://ftp.rhd.ru/pub/fedora/linux/core/5/$ARCH/os/
http://ftp.rhd.ru/pub/fedora/linux/core/5/$ARCH/os/
ftp://ftp.ipex.cz/pub/linux/fedora/core/5/$ARCH/os/
http://fedora.cat.pdx.edu/linux/core/5/$ARCH/os/
http://fedora.ngi.it/5/$ARCH/os/
ftp://falkor.skane.se/pub/mirrors/fedora/core/5/$ARCH/os/
ftp://ftp.cica.es/fedora/linux/core/5/$ARCH/os/
ftp://ftp.free.fr/mirrors/fedora.redhat.com/fedora/linux/core/5/$ARCH/os/
http://ftp.ussg.iu.edu/linux/fedora/linux/core/5/$ARCH/os/
http://ftp.surfnet.nl/ftp/pub/os/Linux/distr/fedora/5/$ARCH/os/
http://ftp.nluug.nl/ftp/pub/os/Linux/distr/fedora/5/$ARCH/os/
ftp://ftp.net.usf.edu/pub/fedora/linux/core/5/$ARCH/os/
http://www.muug.mb.ca/pub/fedora/linux/core/5/$ARCH/os/
http://mirror.eas.muohio.edu/fedora/linux/core/5/$ARCH/os/
http://sunsite.mff.cuni.cz/pub/fedora/5/$ARCH/os/
http://mirror.linux.duke.edu/pub/fedora/linux/core/5/$ARCH/os/
http://distro.ibiblio.org/pub/linux/distributions/fedora/linux/core/5/$ARCH/os/
http://mirror.hiwaay.net/redhat/fedora/linux/core/5/$ARCH/os/
ftp://mirrors.hpcf.upr.edu/pub/Mirrors/redhat/download.fedora.redhat.com/5/$ARCH/os/
http://redhat.secsup.org/fedora/core/5/$ARCH/os/
ftp://ftp.dc.aleron.net/pub/linux/fedora/linux/core/5/$ARCH/os/
ftp://mirror.newnanutilities.org/pub/fedora/linux/core/5/$ARCH/os/
ftp://ftp.software.umn.edu/pub/linux/fedora/core/5/$ARCH/os/
http://www.gtlib.cc.gatech.edu/pub/fedora.redhat/linux/core/5/$ARCH/os/
ftp://fedora.mirrors.tds.net/pub/fedora-core/5/$ARCH/os/
http://mirror.cs.wisc.edu/pub/mirrors/linux/download.fedora.redhat.com/pub/fedora/linux/core/5/$ARCH/os/
http://ftp.ndlug.nd.edu/pub/fedora/linux/core/5/$ARCH/os/
http://fedora.server4you.net/fedora/core/5/$ARCH/os/
ftp://mirrors.ptd.net/fedora/core/5/$ARCH/os/
ftp://fedora.bu.edu/fedora/core/5/$ARCH/os/
http://mirror.pacific.net.au/linux/fedora/linux/core/5/$ARCH/os/