On 07/14/2011 11:03 PM, Michael K. Johnson wrote:
Thanks again to David for the review. Here's what I've done this
evening. Tested working installs and a bunch of failure cases,
including bad URLs, unsupported archive types, and corrupted
archives.
I like this patch, but have some comments as well. See below.
#
# archive.py: An anaconda backend to install from a system archive
#
# The intent is to be able to install an archive (or set of archives)
# similarly to a livecd install, except that there is no need
# to move files around afterward to handle multiple filesystems,
# or to resize a filesystem. This archive could be located at a
# network location or be on install media. The archive is assumed
# to contain all package-managed content.
#
# Copyright (C) 2011 Michael K Johnson. All rights reserved.
# Copyright (C) 2007 Red Hat, Inc. All rights reserved.
Lose the 'All rights reserved.' here. If it's in our file, that's
wrong. I've received talkings-to about this. Bottom line is we need to
not include the 'All rights reserved.' statement.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Author(s): Michael K Johnson <a1237@xxxxxxxxx>
#
import os
import signal
import stat
import subprocess
import sys
from urlgrabber.grabber import URLGrabber, URLGrabError
import storage
from constants import *
import gettext
_ = lambda x: gettext.ldgettext("anaconda", x)
import backend
import isys
import iutil
import network
import packages
import logging
log = logging.getLogger("anaconda")
class ProgressCallback(object):
    """Relays unpack progress to the installer's progress widget."""

    def __init__(self, progress):
        self.progress = progress

    def update(self, current, total=None):
        """Show bytes unpacked so far; total may be None (unknown)."""
        if total is None:
            shownTotal = 'unknown'
        else:
            self.progress.set_fraction(current / float(total))
            shownTotal = '%d' % (total / 1024)
        self.progress.set_label('Unpacked %d of %s KiB' % (
            current / 1024, shownTotal))
        self.progress.processEvents()
class ChunkedData(object):
    """Iterate over a file-like object in fixed-size chunks, reporting
    each chunk's length to sourceObj.update() for progress tracking.

    fobj      -- object with a read(size) method
    sourceObj -- object with an update(length) method
    """

    # bytes per read; large enough to keep the extractor pipe busy
    # without buffering much in memory
    CHUNK_SIZE = 1024 * 1024

    def __init__(self, fobj, sourceObj):
        self.fobj = fobj
        self.sourceObj = sourceObj

    def __iter__(self):
        return self

    def next(self):
        """Return the next chunk, or raise StopIteration at EOF."""
        data = self.fobj.read(self.CHUNK_SIZE)
        if not data:
            raise StopIteration
        self.sourceObj.update(len(data))
        return data

    # Python 3 spells the iterator protocol method __next__; aliasing it
    # makes the class work under both interpreters
    __next__ = next
class MissingUtility(RuntimeError):
    """Raised when the helper program required for an archive or
    compression format is not installed.

    utilityName -- the program that could not be found
    formatName  -- the format that needed it
    """

    def __init__(self, utilityName, formatName, *args, **kw):
        RuntimeError.__init__(self, *args, **kw)
        self.utilityName = utilityName
        self.formatName = formatName
class FailedUtility(RuntimeError):
    """Raised when a decompress/unarchive subprocess exits non-zero.

    message -- human-readable description of the failure
    """

    def __init__(self, message, *args, **kw):
        RuntimeError.__init__(self, *args, **kw)
        self.args = args
        self.message = message
class InvalidRepository(RuntimeError):
    """Raised when the archive repository layout cannot be used
    (e.g. multiple .desc files, or multiple archives without one).

    message -- human-readable description of the problem
    """

    def __init__(self, message, *args, **kw):
        RuntimeError.__init__(self, *args, **kw)
        self.args = args
        self.message = message
class MissingNetwork(RuntimeError):
    """Raised when a network install method is selected but the
    network could not be enabled."""
def archiveFormat(filename):
    """Classify an archive by filename extension.

    Returns (archiveType, compressionType) -- e.g. ('tar', 'gz') --
    where compressionType is '' for an uncompressed archive, or False
    if the extension is not recognized.  Note .tar.Z maps to the 'gz'
    decompressor, which also handles compress(1) data.
    """
    # this should be updated with whatever archive and compression
    # formats are supported in the future
    formatkeys = (
        (('.tar.gz', '.tgz', '.tar.Z'), ('tar', 'gz')),
        (('.tar.bz2', '.tbz2'), ('tar', 'bz2')),
        # BUGFIX: was 'txz' (no dot), which matched any name merely
        # ending in the letters "txz"
        (('.tar.xz', '.txz'), ('tar', 'xz')),
        (('.tar',), ('tar', '')),
        (('.cpio.gz', '.cpio.Z'), ('cpio', 'gz')),
        (('.cpio.bz2',), ('cpio', 'bz2')),
        (('.cpio.xz',), ('cpio', 'xz')),
        (('.cpio',), ('cpio', '')),
    )
    for extensions, formats in formatkeys:
        for extension in extensions:
            if filename.endswith(extension):
                return formats
    return False
How about using the 'magic' module (yum install python-magic) here
instead of keying on specific file name endings? The magic module can
give you the description that file(1) would print out or the mime type.
I think the mime type would be useful here and would prevent us from
having users file bugs when they have an image file named *.TAR.GZ and
this code doesn't work, for example.
Example for magic module:
import magic
m = magic.open(magic.MAGIC_MIME)
m.load()
type = m.file("/path/to/image/file")
'type' will be something like:
'application/x-gzip; charset=binary'
'application/x-bzip2; charset=binary'
'application/x-xz; charset=binary'
'application/x-tar; charset=binary'
'application/octet-stream; charset=binary' <- cpio gives me this, meh?
This will require separating the type checks for the compressed image
and the datastream, but I think this approach will be more flexible down
the road.
class AbstractSource(object):
    """Base class for archive sources.

    Subclasses must set self.sources (a list of (filename, size) pairs,
    size possibly None) and implement openfile(filename) returning a
    readable file-like object.  Iterating a source yields
    (archiveType, compressionType, ChunkedData) triples.
    """

    def __init__(self):
        self.curLen = 0        # bytes delivered to the extractor so far
        self.totalLen = None   # total bytes across all sources, if known
        self.progress = None   # ProgressCallback, set via setProgress()

    def setProgress(self, progress):
        self.progress = progress

    def setTotalLen(self):
        # only a complete set of sizes yields a meaningful total
        if None not in set(x[1] for x in self.sources):
            self.totalLen = sum(x[1] for x in self.sources)

    def update(self, length):
        self.curLen += length
        self.progress.update(self.curLen, self.totalLen)

    def processDescription(self, dirname, description):
        """Populate self.sources from a .desc file-like object."""
        self.sources = []
        for line in description.readlines():
            # tab-delimited:
            # filename (relative to directory .desc is in),
            # (optional) size in bytes (in decimal)
            line = line.strip()
            if not line:
                # BUGFIX: a blank/trailing-newline line used to create a
                # bogus (dirname + '/', None) source entry
                continue
            filename, size = (line.split('\t') + [None])[0:2]
            filename = '/'.join((dirname, filename))
            if isinstance(size, str):
                size = int(size)
            self.sources.append((filename, size))

    def __iter__(self):
        return self

    def next(self):
        """Open the next archive; raise StopIteration when exhausted."""
        if not self.sources:
            raise StopIteration
        filename, size = self.sources.pop(0)
        archiveType, compressionType = archiveFormat(filename)
        dataSource = self.openfile(filename)
        return archiveType, compressionType, ChunkedData(dataSource, self)

    # Python 3 iterator protocol compatibility
    __next__ = next
class URLSource(AbstractSource):
    """Archive source fetched over the network via urlgrabber.

    A URL ending in .desc is parsed as a description file listing the
    archives; any other URL is treated as a single archive.
    """

    def __init__(self, url):
        AbstractSource.__init__(self)
        if url.endswith('.desc'):
            self.processDescription(os.path.dirname(url),
                                    URLGrabber().urlopen(url))
        else:
            self.sources = [(url, None)]
        # We need sizes in order to give progress during the install.
        # If the desc file is missing, or does not contain sizes, then
        # we'll get the headers twice. Small price for simplicity, and
        # if you don't like that, create a .desc file...
        for idx in range(len(self.sources)):
            sourceUrl, sourceSize = self.sources[idx]
            if sourceSize is None:
                handle = URLGrabber().urlopen(sourceUrl)
                length = handle.hdr.getheader('Content-Length')
                if length is not None:
                    self.sources[idx] = (sourceUrl, int(length))
                del handle
        self.setTotalLen()

    def openfile(self, url):
        return URLGrabber().urlopen(url)
class DirectorySource(AbstractSource):
    """Archive source backed by a local directory tree (cdrom or NFS).

    The tree may contain either a single archive, or several archives
    described by exactly one .desc file.

    Raises InvalidRepository when the layout is ambiguous or empty.
    """

    def __init__(self, directory):
        AbstractSource.__init__(self)
        descriptions = []
        archives = []
        for dirname, dirs, files in os.walk(directory):
            descriptions.extend('/'.join((dirname, x))
                for x in files if x.endswith('.desc'))
            archives.extend('/'.join((dirname, x))
                for x in files if archiveFormat(x))
        if len(descriptions) > 1:
            raise InvalidRepository(_('Only one .desc file allowed (%s)')
                % ' '.join(descriptions))
        if len(archives) > 1 and not descriptions:
            raise InvalidRepository(
                _('More than one archive requires .desc file (%s)')
                % ' '.join(archives))
        if descriptions:
            d = descriptions[0]
            self.processDescription(os.path.dirname(d), open(d))
        else:
            if not archives:
                # BUGFIX: previously fell through to archives[0] and
                # raised an unhandled IndexError on an empty tree
                raise InvalidRepository(
                    _('No archive found in %s') % directory)
            source = archives[0]
            size = os.stat(source).st_size
            self.sources = [(source, size)]
        # fill in any sizes the .desc file did not provide
        for i in range(len(self.sources)):
            if self.sources[i][1] is None:
                size = os.stat(self.sources[i][0]).st_size
                self.sources[i] = (self.sources[i][0], size)
        self.setTotalLen()

    def openfile(self, filename):
        return open(filename)
# http://www.chiark.greenend.org.uk/ucgi/~cjwatson/blosxom/2009-07-02-python-sigpipe.html
def subprocess_setup():
    """preexec_fn for child processes: restore default SIGPIPE handling
    (Python ignores SIGPIPE) so tar/cpio die cleanly when gzip is done."""
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)
class ArchiveExtractor(object):
    """Unpack a single archive stream into a target root directory.

    Data fed to write() is piped through a decompressor process into an
    unarchiver (tar or cpio) whose working directory is the root.
    Subprocess output is appended to logs under <root>/root/.
    """

    # argv for each supported archiver; both read the stream from stdin
    extractMap = {
        'tar': ['tar', 'vvvixSf', '-'],
        'cpio': ['cpio', '-ivumd']
    }
    # argv for each supported compression ('' means none, hence cat)
    decompressMap = {
        '': ['cat'],
        'Z': ['gzip', '-dc'],
        'gz': ['gunzip', '-dc'],
        'bz2': ['bunzip2', '-dc'],
        'xz': ['xz', '-dc'],
    }

    def __init__(self, root, compression, archiveFormat):
        self.root = root
        self.compression = compression
        self.archiveFormat = archiveFormat
        self.decompressor = self.decompressMap[compression]
        self.extractor = self.extractMap[archiveFormat]
        # fail early if a required helper program is missing
        for argv, fmt in ((self.decompressor, compression),
                          (self.extractor, archiveFormat)):
            if not iutil.find_program_in_path(argv[0]):
                raise MissingUtility(argv[0], fmt)

    def open(self):
        """Open the logs and start the decompress | unarchive pipeline."""
        targetRoot = self.root
        self.outlog = open(targetRoot + '/root/archiveInstall.out.log', 'a')
        self.errlog = open(targetRoot + '/root/archiveInstall.err.log', 'a')
        self.decompress = subprocess.Popen(
            self.decompressor,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=self.errlog,
            close_fds=True,
            preexec_fn=subprocess_setup,
            cwd=targetRoot)
        self.unarchive = subprocess.Popen(
            self.extractor,
            stdin=self.decompress.stdout,
            stdout=self.outlog,
            stderr=self.errlog,
            close_fds=True,
            preexec_fn=subprocess_setup,
            cwd=targetRoot)
        # Drop our reference so the unarchiver is the pipe's only reader;
        # see http://www.enricozini.org/2009/debian/python-pipes/
        self.decompress.stdout.close()

    def write(self, data):
        """Feed a chunk of raw archive data into the pipeline."""
        self.decompress.stdin.write(data)

    def flush(self):
        self.decompress.stdin.flush()

    def close(self):
        """Finish the stream and reap both subprocesses.

        Raises FailedUtility if either process exited non-zero.
        """
        self.flush()
        self.decompress.stdin.close()
        status = self.unarchive.wait()
        if status:
            raise FailedUtility(_("Failed to unpack archive"))
        status = self.decompress.wait()
        if status:
            raise FailedUtility(_("Failed to decompress archive"))
        self.outlog.close()
        self.errlog.close()
        self.unarchive = None
        self.decompress = None
For extraction, is there any reason we can't use Python's 'tarfile' module?
Likewise, for cpio I see there is the 'cpioarchive' module (yum install
python-cpio), but it does not seem as functional as the cpio program.
I think we should favor Python modules over running external programs
when possible, even if the module is really just doing the same. The
advantage here is that there will be other users of the Python module
and someone else maintaining the module. In instances where there is a
module but it doesn't quite work for us, I think we should either (a)
work with the module author to incorporate the changes we need or
failing that (b) take to invoking the external tool directly, be it a
library or console program.
def archiveSource(directory=None, url=None):
    """Factory: build the appropriate archive source.

    A non-empty url wins and yields a URLSource; otherwise directory is
    used to build a DirectorySource.
    """
    if url:
        source = URLSource(url)
    else:
        source = DirectorySource(directory)
    return source
class ArchiveBackend(backend.AnacondaBackend):
    """Anaconda backend that installs by unpacking system archives onto
    the target filesystems instead of installing packages.

    The archive source may be local media (cdrom:), an NFS mount (nfs:),
    or a http/ftp URL; see doBackendSetup().
    """

    def __init__(self, anaconda):
        backend.AnacondaBackend.__init__(self, anaconda)
        # archive installs are image-based: no upgrades, no package UI
        self.supportsUpgrades = False
        self.supportsPackageSelection = False
        self.archiveSource = None

    def doBackendSetup(self, anaconda):
        """Resolve anaconda.methodstr (prompting on error) into an
        archive source.  Returns DISPATCH_BACK when the user backs out.
        """
        if anaconda.dir == DISPATCH_BACK:
            return DISPATCH_BACK
        intf = anaconda.intf
        m = anaconda.methodstr
        while True:
            if m is None:
                # long blanks make entry window longer...
                m = intf.entryWindow(_("Please enter archive URL"), 60*" ")
            try:
                if m.startswith('cdrom:'):
                    method, location = m.split(':', 1)
                    if not location:
                        location = '/mnt/source'
                    self._getArchiveSource(topdirectory=location)
                else:
                    try:
                        if m.startswith('nfs:'):
                            if not network.hasActiveNetDev():
                                if not intf.enableNetwork():
                                    raise MissingNetwork
                            (opts, server, path) = iutil.parseNfsUrl(m)
                            isys.mount(server+':'+path, '/mnt/source', 'nfs',
                                       options=opts)
                            self._getArchiveSource(directory='/mnt/source')
                        elif m.startswith('http:') or m.startswith('ftp:'):
                            if not network.hasActiveNetDev():
                                if not intf.enableNetwork():
                                    raise MissingNetwork
                            self._getArchiveSource(url=m)
                    except MissingNetwork:
                        # Keep this error in sync with yuminstall.py
                        # for translation purposes
                        rc = intf.messageWindow(_("No Network Available"),
                            _("Some of your software repositories require "
                            "networking, but there was an error enabling the "
                            "network on your system."),
                            type="custom", custom_icon="error",
                            custom_buttons = [_("_Exit installer"), _("_Back")])
                        if rc == 0:
                            sys.exit(1)
                        elif rc == 1:
                            return DISPATCH_BACK
            except (InvalidRepository, SystemError) as e:
                # SystemError on failure to mount NFS
                rc = intf.messageWindow(_("Invalid Repository"),
                    e.message, type="custom", custom_icon="error",
                    custom_buttons = [_("_Exit installer"), _("_Retry")])
                if rc == 0:
                    sys.exit(1)
                m = intf.entryWindow(_("Please enter archive URL"), m)
                continue
            return

    def _getArchiveSource(self, topdirectory=None, directory=None, url=None):
        """Create and store the archive source; topdirectory denotes
        media with archives under <topdirectory>/archives/."""
        if topdirectory:
            directory = topdirectory + '/archives/'
        s = archiveSource(directory=directory, url=url)
        self.archiveSource = s

    def doInstall(self, anaconda):
        """Stream every archive through its extractor onto
        self.instPath, updating install progress as data is unpacked."""
        log.info("Preparing to install archive")
        intf = anaconda.intf
        progress = intf.instProgress
        progress.set_label(_("Unpacking archive to hard drive."))
        progress.processEvents()
        progressCallback = ProgressCallback(intf.instProgress)
        self.archiveSource.setProgress(progressCallback)
        try:
            for archiveType, compressionType, inputData in self.archiveSource:
                # BUGFIX: was named 'e', which the except clauses below
                # shadowed
                extractor = ArchiveExtractor(self.instPath, compressionType,
                                             archiveType)
                extractor.open()
                for dataChunk in inputData:
                    extractor.write(dataChunk)
                extractor.flush()
                try:
                    extractor.close()
                except FailedUtility as e:
                    intf.messageWindow(_("Subprocess Failed"),
                        e.message, type="custom", custom_icon="error",
                        custom_buttons = [_("_Exit installer"),])
                    sys.exit(1)
        except MissingUtility as e:
            intf.messageWindow(_("Unsupported Format"),
                _("The %s format requires the %s utility, "
                "which is not present") % (
                e.formatName, e.utilityName),
                type="custom", custom_icon="error",
                custom_buttons = [_("_Exit installer"),])
            sys.exit(1)
        intf.setInstallProgressClass(None)

    def doPostInstall(self, anaconda):
        """Post-install fixups: graphical boot setup, fstab/mtab,
        initrd rebuild."""
        packages.rpmSetupGraphicalSystem(anaconda)
        # now write out the "real" fstab and mtab
        anaconda.storage.write(anaconda.rootPath)
        # rebuild the initrd(s) for this hardware
        self._rebuildInitrds(anaconda)
        backend.AnacondaBackend.doPostInstall(self, anaconda)

    def _rebuildInitrds(self, anaconda):
        # the unpacked initrds were built for another machine's hardware
        vers = self.kernelVersionList(anaconda.rootPath)
        for (n, arch, tag) in vers:
            packages.recreateInitrd(n, anaconda.rootPath)

    def kernelVersionList(self, rootPath = "/"):
        """Return (version, arch, tag) tuples for installed kernels."""
        return packages.rpmKernelVersionList(rootPath)
Aside from my comments above, the rest of the patch I like. Thanks for
the patch!
--
David Cantrell <dcantrell@xxxxxxxxxx>
Supervisor, Installer Engineering Team
Red Hat, Inc. | Westford, MA | EST5EDT
_______________________________________________
Anaconda-devel-list mailing list
Anaconda-devel-list@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/anaconda-devel-list